From 714d81423e9948fcdb4e3eb948ec188ffd2ac131 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 22 Dec 2025 19:53:18 +0800 Subject: [PATCH 1/8] sched_ext: Avoid multiple irq_work_queue() calls in destroy_dsq() llist_add() returns true only when adding to an empty list, which indicates that no IRQ work is currently queued or running. Therefore, we only need to call irq_work_queue() when llist_add() returns true, to avoid unnecessarily re-queueing IRQ work that is already pending or executing. Signed-off-by: Zqiang Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8f6d8d7f895c..136b01950a62 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3439,8 +3439,8 @@ static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) * operations inside scheduler locks. */ dsq->id = SCX_DSQ_INVALID; - llist_add(&dsq->free_node, &dsqs_to_free); - irq_work_queue(&free_dsq_irq_work); + if (llist_add(&dsq->free_node, &dsqs_to_free)) + irq_work_queue(&free_dsq_irq_work); out_unlock_dsq: raw_spin_unlock_irqrestore(&dsq->lock, flags); From 2f8d489897ae7183b535b1881478b2c6b66d520b Mon Sep 17 00:00:00 2001 From: George Guo Date: Sat, 10 Jan 2026 00:12:14 +0800 Subject: [PATCH 2/8] sched_ext: Add error logging for dsq creation failures Add scx_bpf_error() calls when scx_bpf_create_dsq() fails in multiple schedulers to improve debuggability: - scx_central.bpf.c: central_init() - scx_flatcg.bpf.c: fcg_cgroup_init() and fcg_init() - scx_qmap.bpf.c: qmap_init() Signed-off-by: George Guo Signed-off-by: Tejun Heo --- tools/sched_ext/scx_central.bpf.c | 4 +++- tools/sched_ext/scx_flatcg.bpf.c | 4 +++- tools/sched_ext/scx_qmap.bpf.c | 8 ++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 55df8b798865..1c2376b75b5d 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -301,8 +301,10 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init) int ret; ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); - if (ret) + if (ret) { + scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret); return ret; + } timer = bpf_map_lookup_elem(¢ral_timer, &key); if (!timer) diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 43126858b8e4..c216480c3ee0 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -842,8 +842,10 @@ int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, * unlikely case that it breaks. 
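As an aside on the irq_work change in PATCH 1/8 above: the llist_add() return value is sufficient because only the push that takes the list from empty to non-empty needs to schedule the drain work; every later push knows the work is already pending or running. A minimal userspace sketch of the same hand-off, using C11 atomics instead of the kernel's llist/irq_work API (list_push() and queue_drain_work() are illustrative stand-ins, not kernel interfaces):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct node { struct node *next; };

static _Atomic(struct node *) head;

/* Push onto a lock-free stack; return true if the list was empty before. */
static bool list_push(struct node *n)
{
	struct node *old = atomic_load(&head);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&head, &old, n));

	return old == NULL;
}

/* Illustrative stand-in for irq_work_queue(): arrange for the drainer to run. */
static void queue_drain_work(void)
{
}

void defer_free(struct node *n)
{
	/*
	 * Only the push that makes the list non-empty schedules the drainer,
	 * mirroring "if (llist_add(...)) irq_work_queue(...)" in the patch.
	 */
	if (list_push(n))
		queue_drain_work();
}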
*/ ret = scx_bpf_create_dsq(cgid, -1); - if (ret) + if (ret) { + scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret); return ret; + } cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index df21fad0c438..d51d8c38f1cf 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -866,12 +866,16 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); - if (ret) + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); return ret; + } ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); - if (ret) + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret); return ret; + } timer = bpf_map_lookup_elem(&monitor_timer, &key); if (!timer) From cc4448d0856d424e52b5f53b2592575598233eac Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Thu, 22 Jan 2026 22:26:03 -0500 Subject: [PATCH 3/8] tools/sched_ext: add scx_userland scheduler Add in the scx_userland scheduler that does vruntime-based scheduling in userspace code and communicates scheduling decisions to BPF by accessing and modifying globals through the skeleton. Cc: Tejun Heo Cc: David Vernet Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/scx_userland.bpf.c | 344 +++++++++++++++++++++++ tools/sched_ext/scx_userland.c | 437 +++++++++++++++++++++++++++++ tools/sched_ext/scx_userland.h | 17 ++ 4 files changed, 799 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_userland.bpf.c create mode 100644 tools/sched_ext/scx_userland.c create mode 100644 tools/sched_ext/scx_userland.h diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index e4bda2474060..12043a82a1a9 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg +c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c new file mode 100644 index 000000000000..f29862b89386 --- /dev/null +++ b/tools/sched_ext/scx_userland.bpf.c @@ -0,0 +1,344 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal userland scheduler. + * + * In terms of scheduling, this provides two different types of behaviors: + * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. + * All such tasks are direct-dispatched from the kernel, and are never + * enqueued in user space. + * 2. A primitive vruntime scheduler that is implemented in user space, for all + * other tasks. + * + * Some parts of this example user space scheduler could be implemented more + * efficiently using more complex and sophisticated data structures. For + * example, rather than using BPF_MAP_TYPE_QUEUE's, + * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between + * user space and kernel space. Similarly, we use a simple vruntime-sorted list + * in user space, but an rbtree could be used instead. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include "scx_userland.h" + +/* + * Maximum amount of tasks enqueued/dispatched between kernel and user-space. + */ +#define MAX_ENQUEUED_TASKS 4096 + +char _license[] SEC("license") = "GPL"; + +const volatile s32 usersched_pid; + +/* !0 for veristat, set during init */ +const volatile u32 num_possible_cpus = 64; + +/* Stats that are printed by user space. */ +u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; + +/* + * Number of tasks that are queued for scheduling. + * + * This number is incremented by the BPF component when a task is queued to the + * user-space scheduler and it must be decremented by the user-space scheduler + * when a task is consumed. + */ +volatile u64 nr_queued; + +/* + * Number of tasks that are waiting for scheduling. + * + * This number must be updated by the user-space scheduler to keep track if + * there is still some scheduling work to do. + */ +volatile u64 nr_scheduled; + +UEI_DEFINE(uei); + +/* + * The map containing tasks that are enqueued in user space from the kernel. + * + * This map is drained by the user space scheduler. + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_ENQUEUED_TASKS); + __type(value, struct scx_userland_enqueued_task); +} enqueued SEC(".maps"); + +/* + * The map containing tasks that are dispatched to the kernel from user space. + * + * Drained by the kernel in userland_dispatch(). + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_ENQUEUED_TASKS); + __type(value, s32); +} dispatched SEC(".maps"); + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ */ +}; + +/* Map that contains task-local storage. */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* + * Flag used to wake-up the user-space scheduler. + */ +static volatile u32 usersched_needed; + +/* + * Set user-space scheduler wake-up flag (equivalent to an atomic release + * operation). + */ +static void set_usersched_needed(void) +{ + __sync_fetch_and_or(&usersched_needed, 1); +} + +/* + * Check and clear user-space scheduler wake-up flag (equivalent to an atomic + * acquire operation). + */ +static bool test_and_clear_usersched_needed(void) +{ + return __sync_fetch_and_and(&usersched_needed, 0) == 1; +} + +static bool is_usersched_task(const struct task_struct *p) +{ + return p->pid == usersched_pid; +} + +static bool keep_in_kernel(const struct task_struct *p) +{ + return p->nr_cpus_allowed < num_possible_cpus; +} + +static struct task_struct *usersched_task(void) +{ + struct task_struct *p; + + p = bpf_task_from_pid(usersched_pid); + /* + * Should never happen -- the usersched task should always be managed + * by sched_ext. 
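The wake-up flag a few lines above (set_usersched_needed() and test_and_clear_usersched_needed()) is a plain atomic test-and-clear: fetching while AND-ing with zero hands the old value to at most one of any racing clearers. A standalone C11 sketch of the same idiom (illustrative only, not part of this scheduler):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint needed;

static void set_needed(void)
{
	/* Same effect as __sync_fetch_and_or(&usersched_needed, 1). */
	atomic_fetch_or(&needed, 1);
}

static bool test_and_clear_needed(void)
{
	/*
	 * Exchange with 0 returns the previous value and clears the flag in
	 * one step, so at most one caller observes the wake-up as set.
	 */
	return atomic_exchange(&needed, 0) != 0;
}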
+ */ + if (!p) + scx_bpf_error("Failed to find usersched task %d", usersched_pid); + + return p; +} + +s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + if (keep_in_kernel(p)) { + s32 cpu; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to look up task-local storage for %s", p->comm); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) { + tctx->force_local = true; + return cpu; + } + } + + return prev_cpu; +} + +static void dispatch_user_scheduler(void) +{ + struct task_struct *p; + + p = usersched_task(); + if (p) { + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) +{ + struct scx_userland_enqueued_task task = {}; + + task.pid = p->pid; + task.sum_exec_runtime = p->se.sum_exec_runtime; + task.weight = p->scx.weight; + + if (bpf_map_push_elem(&enqueued, &task, 0)) { + /* + * If we fail to enqueue the task in user space, put it + * directly on the global DSQ. + */ + __sync_fetch_and_add(&nr_failed_enqueues, 1); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } else { + __sync_fetch_and_add(&nr_user_enqueues, 1); + set_usersched_needed(); + } +} + +void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (keep_in_kernel(p)) { + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to lookup task ctx for %s", p->comm); + return; + } + + if (tctx->force_local) + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&nr_kernel_enqueues, 1); + return; + } else if (!is_usersched_task(p)) { + enqueue_task_in_user_space(p, enq_flags); + } +} + +void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) +{ + if (test_and_clear_usersched_needed()) + dispatch_user_scheduler(); + + bpf_repeat(MAX_ENQUEUED_TASKS) { + s32 pid; + struct task_struct *p; + + if (bpf_map_pop_elem(&dispatched, &pid)) + break; + + /* + * The task could have exited by the time we get around to + * dispatching it. Treat this as a normal occurrence, and simply + * move onto the next iteration. + */ + p = bpf_task_from_pid(pid); + if (!p) + continue; + + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +/* + * A CPU is about to change its idle state. If the CPU is going idle, ensure + * that the user-space scheduler has a chance to run if there is any remaining + * work to do. + */ +void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle) +{ + /* + * Don't do anything if we exit from and idle state, a CPU owner will + * be assigned in .running(). + */ + if (!idle) + return; + /* + * A CPU is now available, notify the user-space scheduler that tasks + * can be dispatched, if there is at least one task waiting to be + * scheduled, either queued (accounted in nr_queued) or scheduled + * (accounted in nr_scheduled). 
+ * + * NOTE: nr_queued is incremented by the BPF component, more exactly in + * enqueue(), when a task is sent to the user-space scheduler, then + * the scheduler drains the queued tasks (updating nr_queued) and adds + * them to its internal data structures / state; at this point tasks + * become "scheduled" and the user-space scheduler will take care of + * updating nr_scheduled accordingly; lastly tasks will be dispatched + * and the user-space scheduler will update nr_scheduled again. + * + * Checking both counters allows to determine if there is still some + * pending work to do for the scheduler: new tasks have been queued + * since last check, or there are still tasks "queued" or "scheduled" + * since the previous user-space scheduler run. If the counters are + * both zero it is pointless to wake-up the scheduler (even if a CPU + * becomes idle), because there is nothing to do. + * + * Keep in mind that update_idle() doesn't run concurrently with the + * user-space scheduler (that is single-threaded): this function is + * naturally serialized with the user-space scheduler code, therefore + * this check here is also safe from a concurrency perspective. + */ + if (nr_queued || nr_scheduled) { + /* + * Kick the CPU to make it immediately ready to accept + * dispatched tasks. + */ + set_usersched_needed(); + scx_bpf_kick_cpu(cpu, 0); + } +} + +s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(userland_init) +{ + if (num_possible_cpus == 0) { + scx_bpf_error("User scheduler # CPUs uninitialized (%d)", + num_possible_cpus); + return -EINVAL; + } + + if (usersched_pid <= 0) { + scx_bpf_error("User scheduler pid uninitialized (%d)", + usersched_pid); + return -EINVAL; + } + + return 0; +} + +void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(userland_ops, + .select_cpu = (void *)userland_select_cpu, + .enqueue = (void *)userland_enqueue, + .dispatch = (void *)userland_dispatch, + .update_idle = (void *)userland_update_idle, + .init_task = (void *)userland_init_task, + .init = (void *)userland_init, + .exit = (void *)userland_exit, + .flags = SCX_OPS_ENQ_LAST | + SCX_OPS_KEEP_BUILTIN_IDLE, + .name = "userland"); diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c new file mode 100644 index 000000000000..10b31020f44f --- /dev/null +++ b/tools/sched_ext/scx_userland.c @@ -0,0 +1,437 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext user space scheduler which provides vruntime semantics + * using a simple ordered-list implementation. + * + * Each CPU in the system resides in a single, global domain. This precludes + * the need to do any load balancing between domains. The scheduler could + * easily be extended to support multiple domains, with load balancing + * happening in user space. + * + * Any task which has any CPU affinity is scheduled entirely in BPF. This + * program only schedules tasks which may run on any CPU. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "scx_userland.h" +#include "scx_userland.bpf.skel.h" + +const char help_fmt[] = +"A minimal userland sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n" +"\n" +"Usage: %s [-b BATCH]\n" +"\n" +" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +/* Defined in UAPI */ +#define SCHED_EXT 7 + +/* Number of tasks to batch when dispatching to user space. */ +static __u32 batch_size = 8; + +static bool verbose; +static volatile int exit_req; +static int enqueued_fd, dispatched_fd; + +static struct scx_userland *skel; +static struct bpf_link *ops_link; + +/* Stats collected in user space. */ +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed; + +/* Number of tasks currently enqueued. */ +static __u64 nr_curr_enqueued; + +/* The data structure containing tasks that are enqueued in user space. */ +struct enqueued_task { + LIST_ENTRY(enqueued_task) entries; + __u64 sum_exec_runtime; + double vruntime; +}; + +/* + * Use a vruntime-sorted list to store tasks. This could easily be extended to + * a more optimal data structure, such as an rbtree as is done in CFS. We + * currently elect to use a sorted list to simplify the example for + * illustrative purposes. + */ +LIST_HEAD(listhead, enqueued_task); + +/* + * A vruntime-sorted list of tasks. The head of the list contains the task with + * the lowest vruntime. That is, the task that has the "highest" claim to be + * scheduled. + */ +static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); + +/* + * The main array of tasks. The array is allocated all at once during + * initialization, based on /proc/sys/kernel/pid_max, to avoid having to + * dynamically allocate memory on the enqueue path, which could cause a + * deadlock. A more substantive user space scheduler could e.g. provide a hook + * for newly enabled tasks that are passed to the scheduler from the + * .prep_enable() callback to allows the scheduler to allocate on safe paths. 
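 *
 * To put the pre-allocation in perspective: each enqueued_task below is
 * roughly 32 bytes on 64-bit (two list pointers, a u64 and a double), so
 * a pid_max of 32768 (the traditional default) costs about 1 MiB for the
 * array, while raising pid_max toward its 4194304 limit costs on the
 * order of 128 MiB, which is why the help text above suggests lowering
 * kernel.pid_max if this program triggers OOMs.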
+ */ +struct enqueued_task *tasks; +static int pid_max; + +static double min_vruntime; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int userland) +{ + exit_req = 1; +} + +static int get_pid_max(void) +{ + FILE *fp; + int pid_max; + + fp = fopen("/proc/sys/kernel/pid_max", "r"); + if (fp == NULL) { + fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n"); + return -1; + } + if (fscanf(fp, "%d", &pid_max) != 1) { + fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n"); + fclose(fp); + return -1; + } + fclose(fp); + + return pid_max; +} + +static int init_tasks(void) +{ + pid_max = get_pid_max(); + if (pid_max < 0) + return pid_max; + + tasks = calloc(pid_max, sizeof(*tasks)); + if (!tasks) { + fprintf(stderr, "Error allocating tasks array\n"); + return -ENOMEM; + } + + return 0; +} + +static __u32 task_pid(const struct enqueued_task *task) +{ + return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); +} + +static int dispatch_task(__s32 pid) +{ + int err; + + err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); + if (err) { + nr_vruntime_failed++; + } else { + nr_vruntime_dispatches++; + } + + return err; +} + +static struct enqueued_task *get_enqueued_task(__s32 pid) +{ + if (pid >= pid_max) + return NULL; + + return &tasks[pid]; +} + +static double calc_vruntime_delta(__u64 weight, __u64 delta) +{ + double weight_f = (double)weight / 100.0; + double delta_f = (double)delta; + + return delta_f / weight_f; +} + +static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) +{ + __u64 delta; + + delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; + + enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); + if (min_vruntime > enqueued->vruntime) + enqueued->vruntime = min_vruntime; + enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; +} + +static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) +{ + struct enqueued_task *curr, *enqueued, *prev; + + curr = get_enqueued_task(bpf_task->pid); + if (!curr) + return ENOENT; + + update_enqueued(curr, bpf_task); + nr_vruntime_enqueues++; + nr_curr_enqueued++; + + /* + * Enqueue the task in a vruntime-sorted list. A more optimal data + * structure such as an rbtree could easily be used as well. We elect + * to use a list here simply because it's less code, and thus the + * example is less convoluted and better serves to illustrate what a + * user space scheduler could look like. 
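 *
 * As a concrete example of the sort key: calc_vruntime_delta() above
 * divides the observed runtime delta by weight / 100, so a task at the
 * default weight of 100 that ran for 3 ms accrues 3 ms of vruntime,
 * while a weight-200 task accrues only 1.5 ms for the same runtime and
 * therefore sorts earlier here, receiving a proportionally larger share
 * of the CPU over time.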
+ */ + + if (LIST_EMPTY(&vruntime_head)) { + LIST_INSERT_HEAD(&vruntime_head, curr, entries); + return 0; + } + + LIST_FOREACH(enqueued, &vruntime_head, entries) { + if (curr->vruntime <= enqueued->vruntime) { + LIST_INSERT_BEFORE(enqueued, curr, entries); + return 0; + } + prev = enqueued; + } + + LIST_INSERT_AFTER(prev, curr, entries); + + return 0; +} + +static void drain_enqueued_map(void) +{ + while (1) { + struct scx_userland_enqueued_task task; + int err; + + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) { + skel->bss->nr_queued = 0; + skel->bss->nr_scheduled = nr_curr_enqueued; + return; + } + + err = vruntime_enqueue(&task); + if (err) { + fprintf(stderr, "Failed to enqueue task %d: %s\n", + task.pid, strerror(err)); + exit_req = 1; + return; + } + } +} + +static void dispatch_batch(void) +{ + __u32 i; + + for (i = 0; i < batch_size; i++) { + struct enqueued_task *task; + int err; + __s32 pid; + + task = LIST_FIRST(&vruntime_head); + if (!task) + break; + + min_vruntime = task->vruntime; + pid = task_pid(task); + LIST_REMOVE(task, entries); + err = dispatch_task(pid); + if (err) { + /* + * If we fail to dispatch, put the task back to the + * vruntime_head list and stop dispatching additional + * tasks in this batch. + */ + LIST_INSERT_HEAD(&vruntime_head, task, entries); + break; + } + nr_curr_enqueued--; + } + skel->bss->nr_scheduled = nr_curr_enqueued; +} + +static void *run_stats_printer(void *arg) +{ + while (!exit_req) { + __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; + + nr_failed_enqueues = skel->bss->nr_failed_enqueues; + nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; + nr_user_enqueues = skel->bss->nr_user_enqueues; + total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; + + printf("o-----------------------o\n"); + printf("| BPF ENQUEUES |\n"); + printf("|-----------------------|\n"); + printf("| kern: %10llu |\n", nr_kernel_enqueues); + printf("| user: %10llu |\n", nr_user_enqueues); + printf("| failed: %10llu |\n", nr_failed_enqueues); + printf("| -------------------- |\n"); + printf("| total: %10llu |\n", total); + printf("| |\n"); + printf("|-----------------------|\n"); + printf("| VRUNTIME / USER |\n"); + printf("|-----------------------|\n"); + printf("| enq: %10llu |\n", nr_vruntime_enqueues); + printf("| disp: %10llu |\n", nr_vruntime_dispatches); + printf("| failed: %10llu |\n", nr_vruntime_failed); + printf("o-----------------------o\n"); + printf("\n\n"); + fflush(stdout); + sleep(1); + } + + return NULL; +} + +static int spawn_stats_thread(void) +{ + pthread_t stats_printer; + + return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); +} + +static void pre_bootstrap(int argc, char **argv) +{ + int err; + __u32 opt; + struct sched_param sched_param = { + .sched_priority = sched_get_priority_max(SCHED_EXT), + }; + + err = init_tasks(); + if (err) + exit(err); + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + /* + * Enforce that the user scheduler task is managed by sched_ext. The + * task eagerly drains the list of enqueued tasks in its main work + * loop, and then yields the CPU. The BPF scheduler only schedules the + * user space scheduler task when at least one other task in the system + * needs to be scheduled. 
+ */ + err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); + SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); + + while ((opt = getopt(argc, argv, "b:vh")) != -1) { + switch (opt) { + case 'b': + batch_size = strtoul(optarg, NULL, 0); + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + exit(opt != 'h'); + } + } + + /* + * It's not always safe to allocate in a user space scheduler, as an + * enqueued task could hold a lock that we require in order to be able + * to allocate. + */ + err = mlockall(MCL_CURRENT | MCL_FUTURE); + SCX_BUG_ON(err, "Failed to prefault and lock address space"); +} + +static void bootstrap(char *comm) +{ + skel = SCX_OPS_OPEN(userland_ops, scx_userland); + + skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->num_possible_cpus > 0); + skel->rodata->usersched_pid = getpid(); + assert(skel->rodata->usersched_pid > 0); + + SCX_OPS_LOAD(skel, userland_ops, scx_userland, uei); + + enqueued_fd = bpf_map__fd(skel->maps.enqueued); + dispatched_fd = bpf_map__fd(skel->maps.dispatched); + assert(enqueued_fd > 0); + assert(dispatched_fd > 0); + + SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); + + ops_link = SCX_OPS_ATTACH(skel, userland_ops, scx_userland); +} + +static void sched_main_loop(void) +{ + while (!exit_req) { + /* + * Perform the following work in the main user space scheduler + * loop: + * + * 1. Drain all tasks from the enqueued map, and enqueue them + * to the vruntime sorted list. + * + * 2. Dispatch a batch of tasks from the vruntime sorted list + * down to the kernel. + * + * 3. Yield the CPU back to the system. The BPF scheduler will + * reschedule the user space scheduler once another task has + * been enqueued to user space. + */ + drain_enqueued_map(); + dispatch_batch(); + sched_yield(); + } +} + +int main(int argc, char **argv) +{ + __u64 ecode; + + pre_bootstrap(argc, argv); +restart: + bootstrap(argv[0]); + sched_main_loop(); + + exit_req = 1; + bpf_link__destroy(ops_link); + ecode = UEI_REPORT(skel, uei); + scx_userland__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h new file mode 100644 index 000000000000..684fb2dd5de9 --- /dev/null +++ b/tools/sched_ext/scx_userland.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta, Inc */ + +#ifndef __SCX_USERLAND_COMMON_H +#define __SCX_USERLAND_COMMON_H + +/* + * An instance of a task that has been enqueued by the kernel for consumption + * by a user space global scheduler thread. + */ +struct scx_userland_enqueued_task { + __s32 pid; + u64 sum_exec_runtime; + u64 weight; +}; + +#endif // __SCX_USERLAND_COMMON_H From f0262b102c7ce43f3744bdb0278ddf0d15bb1a71 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Thu, 22 Jan 2026 22:26:04 -0500 Subject: [PATCH 4/8] tools/sched_ext: add scx_pair scheduler Add the scx_pair cgroup-based core scheduler. 
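Each CPU is paired with exactly one sibling according to a configurable stride, and the two CPUs of a pair only ever run tasks belonging to the same CPU cgroup, synchronizing their decisions through a shared pair_ctx. See the top-level comment in scx_pair.bpf.c for details.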
Cc: Tejun Heo Cc: David Vernet Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/scx_pair.bpf.c | 610 +++++++++++++++++++++++++++++++++ tools/sched_ext/scx_pair.c | 180 ++++++++++ tools/sched_ext/scx_pair.h | 9 + 4 files changed, 800 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_pair.bpf.c create mode 100644 tools/sched_ext/scx_pair.c create mode 100644 tools/sched_ext/scx_pair.h diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 12043a82a1a9..208e8f8fe4d8 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland +c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/scx_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c new file mode 100644 index 000000000000..267011b57cba --- /dev/null +++ b/tools/sched_ext/scx_pair.bpf.c @@ -0,0 +1,610 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext core-scheduler which always makes every sibling CPU pair + * execute from the same CPU cgroup. + * + * This scheduler is a minimal implementation and would need some form of + * priority handling both inside each cgroup and across the cgroups to be + * practically useful. + * + * Each CPU in the system is paired with exactly one other CPU, according to a + * "stride" value that can be specified when the BPF scheduler program is first + * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee + * that they will only ever schedule tasks that belong to the same CPU cgroup. + * + * Scheduler Initialization + * ------------------------ + * + * The scheduler BPF program is first initialized from user space, before it is + * enabled. During this initialization process, each CPU on the system is + * assigned several values that are constant throughout its runtime: + * + * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling + * decisions. Paired CPUs always schedule tasks from the same + * CPU cgroup, and synchronize with each other to guarantee + * that this constraint is not violated. + * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access + * a struct pair_ctx object that is shared between the pair. + * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the + * pair. Each struct pair_ctx has an active_mask field, + * which is a bitmap used to indicate whether each core + * in the pair currently has an actively running task. + * This index specifies which entry in the bitmap corresponds + * to each CPU in the pair. + * + * During this initialization, the CPUs are paired according to a "stride" that + * may be specified when invoking the user space program that initializes and + * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. + * + * Tasks and cgroups + * ----------------- + * + * Every cgroup in the system is registered with the scheduler using the + * pair_cgroup_init() callback, and every task in the system is associated with + * exactly one cgroup. At a high level, the idea with the pair scheduler is to + * always schedule tasks from the same cgroup within a given CPU pair. 
When a + * task is enqueued (i.e. passed to the pair_enqueue() callback function), its + * cgroup ID is read from its task struct, and then a corresponding queue map + * is used to FIFO-enqueue the task for that cgroup. + * + * If you look through the implementation of the scheduler, you'll notice that + * there is quite a bit of complexity involved with looking up the per-cgroup + * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash + * BPF hash map that is used to map a cgroup ID to a globally unique ID that's + * allocated in the BPF program. This is done because we use separate maps to + * store the FIFO queue of tasks, and the length of that map, per cgroup. This + * complexity is only present because of current deficiencies in BPF that will + * soon be addressed. The main point to keep in mind is that newly enqueued + * tasks are added to their cgroup's FIFO queue. + * + * Dispatching tasks + * ----------------- + * + * This section will describe how enqueued tasks are dispatched and scheduled. + * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is + * as follows: + * + * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is + * the structure that's used to synchronize amongst the two pair CPUs in their + * scheduling decisions. After any of the following events have occurred: + * + * - The cgroup's slice run has expired, or + * - The cgroup becomes empty, or + * - Either CPU in the pair is preempted by a higher priority scheduling class + * + * The cgroup transitions to the draining state and stops executing new tasks + * from the cgroup. + * + * 2. If the pair is still executing a task, mark the pair_ctx as draining, and + * wait for the pair CPU to be preempted. + * + * 3. Otherwise, if the pair CPU is not running a task, we can move onto + * scheduling new tasks. Pop the next cgroup id from the top_q queue. + * + * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. + * + * Note again that this scheduling behavior is simple, but the implementation + * is complex mostly because this it hits several BPF shortcomings and has to + * work around in often awkward ways. Most of the shortcomings are expected to + * be resolved in the near future which should allow greatly simplifying this + * scheduler. + * + * Dealing with preemption + * ----------------------- + * + * SCX is the lowest priority sched_class, and could be preempted by them at + * any time. To address this, the scheduler implements pair_cpu_release() and + * pair_cpu_acquire() callbacks which are invoked by the core scheduler when + * the scheduler loses and gains control of the CPU respectively. + * + * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and + * then invoke: + * + * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + * + * This preempts the pair CPU, and waits until it has re-entered the scheduler + * before returning. This is necessary to ensure that the higher priority + * sched_class that preempted our scheduler does not schedule a task + * concurrently with our pair CPU. + * + * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption + * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable + * pair scheduling. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include "scx_pair.h" + +char _license[] SEC("license") = "GPL"; + +/* !0 for veristat, set during init */ +const volatile u32 nr_cpu_ids = 1; + +/* a pair of CPUs stay on a cgroup for this duration */ +const volatile u32 pair_batch_dur_ns; + +/* cpu ID -> pair cpu ID */ +const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); + +/* cpu ID -> pair_id */ +const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); + +/* CPU ID -> CPU # in the pair (0 or 1) */ +const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); + +struct pair_ctx { + struct bpf_spin_lock lock; + + /* the cgroup the pair is currently executing */ + u64 cgid; + + /* the pair started executing the current cgroup at */ + u64 started_at; + + /* whether the current cgroup is draining */ + bool draining; + + /* the CPUs that are currently active on the cgroup */ + u32 active_mask; + + /* + * the CPUs that are currently preempted and running tasks in a + * different scheduler. + */ + u32 preempted_mask; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct pair_ctx); +} pair_ctx SEC(".maps"); + +/* queue of cgrp_q's possibly with tasks on them */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + /* + * Because it's difficult to build strong synchronization encompassing + * multiple non-trivial operations in BPF, this queue is managed in an + * opportunistic way so that we guarantee that a cgroup w/ active tasks + * is always on it but possibly multiple times. Once we have more robust + * synchronization constructs and e.g. linked list, we should be able to + * do this in a prettier way but for now just size it big enough. + */ + __uint(max_entries, 4 * MAX_CGRPS); + __type(value, u64); +} top_q SEC(".maps"); + +/* per-cgroup q which FIFOs the tasks from the cgroup */ +struct cgrp_q { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_QUEUED); + __type(value, u32); +}; + +/* + * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local + * storage; however, a cgroup local storage can only be accessed from the BPF + * progs attached to the cgroup. For now, work around by allocating array of + * cgrp_q's and then allocating per-cgroup indices. + * + * Another caveat: It's difficult to populate a large array of maps statically + * or from BPF. Initialize it from userland. + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_CGRPS); + __type(key, s32); + __array(values, struct cgrp_q); +} cgrp_q_arr SEC(".maps"); + +static u64 cgrp_q_len[MAX_CGRPS]; + +/* + * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be + * useful to have as a map type. + */ +static u32 cgrp_q_idx_cursor; +static u64 cgrp_q_idx_busy[MAX_CGRPS]; + +/* + * All added up, the following is what we do: + * + * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking + * for a free ID. If not found, fail cgroup creation with -EBUSY. + * + * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following + * cgrp_q_idx_hash. + * + * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from + * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. + * + * This is sadly complicated for something pretty simple. Hopefully, we should + * be able to simplify in the future. 
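Concretely, the slot allocation in step 1 can be sketched in plain userspace C as follows; this is illustrative only, the in-tree version is pair_cgroup_init() further down, which uses the same __sync builtins:

#include <stdint.h>

#define MAX_CGRPS 4096

static uint64_t cgrp_q_idx_busy[MAX_CGRPS];
static uint32_t cgrp_q_idx_cursor;

/*
 * Round-robin over the busy array starting at the cursor, claiming the
 * first free slot with a compare-and-swap; returns the slot index, or -1
 * if all MAX_CGRPS slots are already taken.
 */
static int alloc_q_idx(void)
{
	int i;

	for (i = 0; i < MAX_CGRPS; i++) {
		uint32_t q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS;

		if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1))
			return q_idx;
	}
	return -1;
}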
+ */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_CGRPS); + __uint(key_size, sizeof(u64)); /* cgrp ID */ + __uint(value_size, sizeof(s32)); /* cgrp_q idx */ +} cgrp_q_idx_hash SEC(".maps"); + +/* statistics */ +u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; +u64 nr_exps, nr_exp_waits, nr_exp_empty; +u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; + +UEI_DEFINE(uei); + +void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + struct cgrp_q *cgq; + s32 pid = p->pid; + u64 cgid; + u32 *q_idx; + u64 *cgq_len; + + __sync_fetch_and_add(&nr_total, 1); + + cgrp = scx_bpf_task_cgroup(p); + cgid = cgrp->kn->id; + bpf_cgroup_release(cgrp); + + /* find the cgroup's q and push @p into it */ + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!q_idx) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return; + } + + cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); + if (!cgq) { + scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", + cgid, *q_idx); + return; + } + + if (bpf_map_push_elem(cgq, &pid, 0)) { + scx_bpf_error("cgroup[%llu] queue overflow", cgid); + return; + } + + /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len) { + scx_bpf_error("MEMBER_VTPR malfunction"); + return; + } + + if (!__sync_fetch_and_add(cgq_len, 1) && + bpf_map_push_elem(&top_q, &cgid, 0)) { + scx_bpf_error("top_q overflow"); + return; + } +} + +static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) +{ + u32 *vptr; + + vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); + if (!vptr) + return -EINVAL; + + *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); + if (!(*pairc)) + return -EINVAL; + + vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); + if (!vptr) + return -EINVAL; + + *mask = 1U << *vptr; + + return 0; +} + +__attribute__((noinline)) +static int try_dispatch(s32 cpu) +{ + struct pair_ctx *pairc; + struct bpf_map *cgq_map; + struct task_struct *p; + u64 now = scx_bpf_now(); + bool kick_pair = false; + bool expired, pair_preempted; + u32 *vptr, in_pair_mask; + s32 pid, q_idx; + u64 cgid; + int ret; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) { + scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", + cpu); + return -ENOENT; + } + + bpf_spin_lock(&pairc->lock); + pairc->active_mask &= ~in_pair_mask; + + expired = time_before(pairc->started_at + pair_batch_dur_ns, now); + if (expired || pairc->draining) { + u64 new_cgid = 0; + + __sync_fetch_and_add(&nr_exps, 1); + + /* + * We're done with the current cgid. An obvious optimization + * would be not draining if the next cgroup is the current one. + * For now, be dumb and always expire. + */ + pairc->draining = true; + + pair_preempted = pairc->preempted_mask; + if (pairc->active_mask || pair_preempted) { + /* + * The other CPU is still active, or is no longer under + * our control due to e.g. being preempted by a higher + * priority sched_class. We want to wait until this + * cgroup expires, or until control of our pair CPU has + * been returned to us. + * + * If the pair controls its CPU, and the time already + * expired, kick. When the other CPU arrives at + * dispatch and clears its active mask, it'll push the + * pair to the next cgroup and kick this CPU. 
+ */ + __sync_fetch_and_add(&nr_exp_waits, 1); + bpf_spin_unlock(&pairc->lock); + if (expired && !pair_preempted) + kick_pair = true; + goto out_maybe_kick; + } + + bpf_spin_unlock(&pairc->lock); + + /* + * Pick the next cgroup. It'd be easier / cleaner to not drop + * pairc->lock and use stronger synchronization here especially + * given that we'll be switching cgroups significantly less + * frequently than tasks. Unfortunately, bpf_spin_lock can't + * really protect anything non-trivial. Let's do opportunistic + * operations instead. + */ + bpf_repeat(BPF_MAX_LOOPS) { + u32 *q_idx; + u64 *cgq_len; + + if (bpf_map_pop_elem(&top_q, &new_cgid)) { + /* no active cgroup, go idle */ + __sync_fetch_and_add(&nr_exp_empty, 1); + return 0; + } + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid); + if (!q_idx) + continue; + + /* + * This is the only place where empty cgroups are taken + * off the top_q. + */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len || !*cgq_len) + continue; + + /* + * If it has any tasks, requeue as we may race and not + * execute it. + */ + bpf_map_push_elem(&top_q, &new_cgid, 0); + break; + } + + bpf_spin_lock(&pairc->lock); + + /* + * The other CPU may already have started on a new cgroup while + * we dropped the lock. Make sure that we're still draining and + * start on the new cgroup. + */ + if (pairc->draining && !pairc->active_mask) { + __sync_fetch_and_add(&nr_cgrp_next, 1); + pairc->cgid = new_cgid; + pairc->started_at = now; + pairc->draining = false; + kick_pair = true; + } else { + __sync_fetch_and_add(&nr_cgrp_coll, 1); + } + } + + cgid = pairc->cgid; + pairc->active_mask |= in_pair_mask; + bpf_spin_unlock(&pairc->lock); + + /* again, it'd be better to do all these with the lock held, oh well */ + vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!vptr) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return -ENOENT; + } + q_idx = *vptr; + + /* claim one task from cgrp_q w/ q_idx */ + bpf_repeat(BPF_MAX_LOOPS) { + u64 *cgq_len, len; + + cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]); + if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) { + /* the cgroup must be empty, expire and repeat */ + __sync_fetch_and_add(&nr_cgrp_empty, 1); + bpf_spin_lock(&pairc->lock); + pairc->draining = true; + pairc->active_mask &= ~in_pair_mask; + bpf_spin_unlock(&pairc->lock); + return -EAGAIN; + } + + if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) + continue; + + break; + } + + cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx); + if (!cgq_map) { + scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", + cgid, q_idx); + return -ENOENT; + } + + if (bpf_map_pop_elem(cgq_map, &pid)) { + scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", + cgid, q_idx); + return -ENOENT; + } + + p = bpf_task_from_pid(pid); + if (p) { + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } else { + /* we don't handle dequeues, retry on lost tasks */ + __sync_fetch_and_add(&nr_missing, 1); + return -EAGAIN; + } + +out_maybe_kick: + if (kick_pair) { + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } + return 0; +} + +void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) +{ + bpf_repeat(BPF_MAX_LOOPS) { + if (try_dispatch(cpu) != -EAGAIN) + break; + } +} + +void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct 
scx_cpu_acquire_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask &= ~in_pair_mask; + /* Kick the pair CPU, unless it was also preempted. */ + kick_pair = !pairc->preempted_mask; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } +} + +void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask |= in_pair_mask; + pairc->active_mask &= ~in_pair_mask; + /* Kick the pair CPU if it's still running. */ + kick_pair = pairc->active_mask; + pairc->draining = true; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + } + } + __sync_fetch_and_add(&nr_preemptions, 1); +} + +s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 i, q_idx; + + bpf_for(i, 0, MAX_CGRPS) { + q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; + if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) + break; + } + if (i == MAX_CGRPS) + return -EBUSY; + + if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); + if (busy) + *busy = 0; + return -EBUSY; + } + + return 0; +} + +void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 *q_idx; + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (q_idx) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); + if (busy) + *busy = 0; + bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); + } +} + +void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(pair_ops, + .enqueue = (void *)pair_enqueue, + .dispatch = (void *)pair_dispatch, + .cpu_acquire = (void *)pair_cpu_acquire, + .cpu_release = (void *)pair_cpu_release, + .cgroup_init = (void *)pair_cgroup_init, + .cgroup_exit = (void *)pair_cgroup_exit, + .exit = (void *)pair_exit, + .name = "pair"); diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c new file mode 100644 index 000000000000..d3e97faa6334 --- /dev/null +++ b/tools/sched_ext/scx_pair.c @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "scx_pair.h" +#include "scx_pair.bpf.skel.h" + +const char help_fmt[] = +"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" +"execute from the same CPU cgroup.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-S STRIDE]\n" +"\n" +" -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_pair *skel; + struct bpf_link *link; + __u64 seq = 0, ecode; + __s32 stride, i, opt, outer_fd; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(pair_ops, scx_pair); + + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + assert(skel->rodata->nr_cpu_ids > 0); + skel->rodata->pair_batch_dur_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + + /* pair up the earlier half to the latter by default, override with -s */ + stride = skel->rodata->nr_cpu_ids / 2; + + while ((opt = getopt(argc, argv, "S:vh")) != -1) { + switch (opt) { + case 'S': + stride = strtoul(optarg, NULL, 0); + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2); + + /* Resize arrays so their element count is equal to cpu count. */ + RESIZE_ARRAY(skel, rodata, pair_cpu, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(skel, rodata, pair_id, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(skel, rodata, in_pair_idx, skel->rodata->nr_cpu_ids); + + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) + skel->rodata_pair_cpu->pair_cpu[i] = -1; + + printf("Pairs: "); + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { + int j = (i + stride) % skel->rodata->nr_cpu_ids; + + if (skel->rodata_pair_cpu->pair_cpu[i] >= 0) + continue; + + SCX_BUG_ON(i == j, + "Invalid stride %d - CPU%d wants to be its own pair", + stride, i); + + SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0, + "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair", + stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]); + + skel->rodata_pair_cpu->pair_cpu[i] = j; + skel->rodata_pair_cpu->pair_cpu[j] = i; + skel->rodata_pair_id->pair_id[i] = i; + skel->rodata_pair_id->pair_id[j] = i; + skel->rodata_in_pair_idx->in_pair_idx[i] = 0; + skel->rodata_in_pair_idx->in_pair_idx[j] = 1; + + printf("[%d, %d] ", i, j); + } + printf("\n"); + + SCX_OPS_LOAD(skel, pair_ops, scx_pair, uei); + + /* + * Populate the cgrp_q_arr map which is an array containing per-cgroup + * queues. It'd probably be better to do this from BPF but there are too + * many to initialize statically and there's no way to dynamically + * populate from BPF. 
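 *
 * Note that once an inner queue fd has been inserted into the outer
 * array-of-maps, the kernel holds its own reference on the inner map, so
 * the loop below can close(inner_fd) right after bpf_map_update_elem()
 * without invalidating the entry.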
+ */ + outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); + SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd); + + printf("Initializing"); + for (i = 0; i < MAX_CGRPS; i++) { + __s32 inner_fd; + + if (exit_req) + break; + + inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, + sizeof(__u32), MAX_QUEUED, NULL); + SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d", + inner_fd); + SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY), + "Failed to set inner map"); + close(inner_fd); + + if (!(i % 10)) + printf("."); + fflush(stdout); + } + printf("\n"); + + /* + * Fully initialized, attach and run. + */ + link = SCX_OPS_ATTACH(skel, pair_ops, scx_pair); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("[SEQ %llu]\n", seq++); + printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n", + skel->bss->nr_total, + skel->bss->nr_dispatched, + skel->bss->nr_missing); + printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n", + skel->bss->nr_kicks, + skel->bss->nr_preemptions); + printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n", + skel->bss->nr_exps, + skel->bss->nr_exp_waits, + skel->bss->nr_exp_empty); + printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n", + skel->bss->nr_cgrp_next, + skel->bss->nr_cgrp_coll, + skel->bss->nr_cgrp_empty); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_pair__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_pair.h b/tools/sched_ext/scx_pair.h new file mode 100644 index 000000000000..d9666a447d3f --- /dev/null +++ b/tools/sched_ext/scx_pair.h @@ -0,0 +1,9 @@ +#ifndef __SCX_EXAMPLE_PAIR_H +#define __SCX_EXAMPLE_PAIR_H + +enum { + MAX_QUEUED = 4096, + MAX_CGRPS = 4096, +}; + +#endif /* __SCX_EXAMPLE_PAIR_H */ From 36929ebd17ae66ed3acde9056a9daf611d81a2e5 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Thu, 22 Jan 2026 22:26:05 -0500 Subject: [PATCH 5/8] tools/sched_ext: add arena based scheduler Add a scheduler that uses BPF arenas to manage task context data. Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/scx_sdt.bpf.c | 710 ++++++++++++++++++++++++++++++++++ tools/sched_ext/scx_sdt.c | 101 +++++ tools/sched_ext/scx_sdt.h | 113 ++++++ 4 files changed, 925 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_sdt.bpf.c create mode 100644 tools/sched_ext/scx_sdt.c create mode 100644 tools/sched_ext/scx_sdt.h diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 208e8f8fe4d8..47ad7444677e 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair +c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair scx_sdt $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c new file mode 100644 index 000000000000..48ea18614e28 --- /dev/null +++ b/tools/sched_ext/scx_sdt.bpf.c @@ -0,0 +1,710 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Arena-based task data scheduler. 
This is a variation of scx_simple + * that uses a combined allocator and indexing structure to organize + * task data. Task context allocation is done when a task enters the + * scheduler, while freeing is done when it exits. Task contexts are + * retrieved from task-local storage, pointing to the allocated memory. + * + * The main purpose of this scheduler is to demostrate arena memory + * management. + * + * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024-2025 Emil Tsalapatis + * Copyright (c) 2024-2025 Tejun Heo + * + */ +#include +#include + +#include "scx_sdt.h" + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__) + __uint(max_entries, 1 << 16); /* number of pages */ + __ulong(map_extra, (1ull << 32)); /* start of mmap() region */ +#else + __uint(max_entries, 1 << 20); /* number of pages */ + __ulong(map_extra, (1ull << 44)); /* start of mmap() region */ +#endif +} arena __weak SEC(".maps"); + +#define SHARED_DSQ 0 + +#define DEFINE_SDT_STAT(metric) \ +static inline void \ +stat_inc_##metric(struct scx_stats __arena *stats) \ +{ \ + cast_kern(stats); \ + stats->metric += 1; \ +} \ +__u64 stat_##metric; \ + +DEFINE_SDT_STAT(enqueue); +DEFINE_SDT_STAT(init); +DEFINE_SDT_STAT(exit); +DEFINE_SDT_STAT(select_idle_cpu); +DEFINE_SDT_STAT(select_busy_cpu); + +/* + * Necessary for cond_break/can_loop's semantics. According to kernel commit + * 011832b, the loop counter variable must be seen as imprecise and bounded + * by the verifier. Initializing it from a constant (e.g., i = 0;), then, + * makes it precise and prevents may_goto from helping with converging the + * loop. For these loops we must initialize the loop counter from a variable + * whose value the verifier cannot reason about when checking the program, so + * that the loop counter's value is imprecise. + */ +static __u64 zero = 0; + +/* + * XXX Hack to get the verifier to find the arena for sdt_exit_task. + * As of 6.12-rc5, The verifier associates arenas with programs by + * checking LD.IMM instruction operands for an arena and populating + * the program state with the first instance it finds. This requires + * accessing our global arena variable, but scx methods do not necessarily + * do so while still using pointers from that arena. Insert a bpf_printk + * statement that triggers at most once to generate an LD.IMM instruction + * to access the arena and help the verifier. + */ +static volatile bool scx_arena_verify_once; + +__hidden void scx_arena_subprog_init(void) +{ + if (scx_arena_verify_once) + return; + + bpf_printk("%s: arena pointer %p", __func__, &arena); + scx_arena_verify_once = true; +} + + +private(LOCK) struct bpf_spin_lock alloc_lock; +private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock; + +/* allocation pools */ +struct sdt_pool desc_pool; +struct sdt_pool chunk_pool; + +/* Protected by alloc_lock. */ +struct scx_alloc_stats alloc_stats; + + +/* Allocate element from the pool. Must be called with a then pool lock held. */ +static +void __arena *scx_alloc_from_pool(struct sdt_pool *pool) +{ + __u64 elem_size, max_elems; + void __arena *slab; + void __arena *ptr; + + elem_size = pool->elem_size; + max_elems = pool->max_elems; + + /* If the chunk is spent, get a new one. 
*/ + if (pool->idx >= max_elems) { + slab = bpf_arena_alloc_pages(&arena, NULL, + div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0); + if (!slab) + return NULL; + + pool->slab = slab; + pool->idx = 0; + } + + ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx); + pool->idx += 1; + + return ptr; +} + +/* Alloc desc and associated chunk. Called with the allocator spinlock held. */ +static sdt_desc_t *scx_alloc_chunk(void) +{ + struct sdt_chunk __arena *chunk; + sdt_desc_t *desc; + sdt_desc_t *out; + + chunk = scx_alloc_from_pool(&chunk_pool); + if (!chunk) + return NULL; + + desc = scx_alloc_from_pool(&desc_pool); + if (!desc) { + /* + * Effectively frees the previous chunk allocation. + * Index cannot be 0, so decrementing is always + * valid. + */ + chunk_pool.idx -= 1; + return NULL; + } + + out = desc; + + desc->nr_free = SDT_TASK_ENTS_PER_CHUNK; + desc->chunk = chunk; + + alloc_stats.chunk_allocs += 1; + + return out; +} + +static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages) +{ + if (unlikely(data_size % 8)) + return -EINVAL; + + if (unlikely(nr_pages == 0)) + return -EINVAL; + + pool->elem_size = data_size; + pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size; + /* Populate the pool slab on the first allocation. */ + pool->idx = pool->max_elems; + + return 0; +} + +/* Initialize both the base pool allocators and the root chunk of the index. */ +__hidden int +scx_alloc_init(struct scx_allocator *alloc, __u64 data_size) +{ + size_t min_chunk_size; + int ret; + + _Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE, + "chunk size must fit into a page"); + + ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1); + if (ret != 0) + return ret; + + ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1); + if (ret != 0) + return ret; + + /* Wrap data into a descriptor and word align. */ + data_size += sizeof(struct sdt_data); + data_size = round_up(data_size, 8); + + /* + * Ensure we allocate large enough chunks from the arena to avoid excessive + * internal fragmentation when turning chunks it into structs. + */ + min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE); + ret = pool_set_size(&alloc->pool, data_size, min_chunk_size); + if (ret != 0) + return ret; + + bpf_spin_lock(&alloc_lock); + alloc->root = scx_alloc_chunk(); + bpf_spin_unlock(&alloc_lock); + if (!alloc->root) + return -ENOMEM; + + return 0; +} + +static +int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state) +{ + __u64 __arena *allocated = desc->allocated; + __u64 bit; + + if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK)) + return -EINVAL; + + bit = (__u64)1 << (pos % 64); + + if (state) + allocated[pos / 64] |= bit; + else + allocated[pos / 64] &= ~bit; + + return 0; +} + +static __noinline +int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS]) +{ + sdt_desc_t *desc; + __u64 u, level; + int ret; + + for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) { + level = SDT_TASK_LEVELS - 1 - u; + + /* Only propagate upwards if we are the parent's only free chunk. */ + desc = lv_desc[level]; + + ret = set_idx_state(desc, lv_pos[level], false); + if (unlikely(ret != 0)) + return ret; + + desc->nr_free += 1; + if (desc->nr_free > 1) + return 0; + } + + return 0; +} + +/* + * Free the allocated struct with the given index. Called with the + * allocator lock taken. 
+ */ +__hidden +int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx) +{ + const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1; + sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; + sdt_desc_t * __arena *desc_children; + struct sdt_chunk __arena *chunk; + sdt_desc_t *desc; + struct sdt_data __arena *data; + __u64 level, shift, pos; + __u64 lv_pos[SDT_TASK_LEVELS]; + int ret; + int i; + + if (!alloc) + return 0; + + desc = alloc->root; + if (unlikely(!desc)) + return -EINVAL; + + /* To appease the verifier. */ + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + lv_desc[level] = NULL; + lv_pos[level] = 0; + } + + /* Find the leaf node containing the index. */ + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT; + pos = (idx >> shift) & mask; + + lv_desc[level] = desc; + lv_pos[level] = pos; + + if (level == SDT_TASK_LEVELS - 1) + break; + + chunk = desc->chunk; + + desc_children = (sdt_desc_t * __arena *)chunk->descs; + desc = desc_children[pos]; + + if (unlikely(!desc)) + return -EINVAL; + } + + chunk = desc->chunk; + + pos = idx & mask; + data = chunk->data[pos]; + if (likely(data)) { + data[pos] = (struct sdt_data) { + .tid.genn = data->tid.genn + 1, + }; + + /* Zero out one word at a time. */ + for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) { + data->payload[i] = 0; + } + } + + ret = mark_nodes_avail(lv_desc, lv_pos); + if (unlikely(ret != 0)) + return ret; + + alloc_stats.active_allocs -= 1; + alloc_stats.free_ops += 1; + + return 0; +} + +static inline +int ffs(__u64 word) +{ + unsigned int num = 0; + + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } + + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + + if ((word & 0xf) == 0) { + num += 4; + word >>= 4; + } + + if ((word & 0x3) == 0) { + num += 2; + word >>= 2; + } + + if ((word & 0x1) == 0) { + num += 1; + word >>= 1; + } + + return num; +} + + +/* find the first empty slot */ +__hidden +__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc) +{ + __u64 freeslots; + __u64 i; + + for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) { + freeslots = ~desc->allocated[i]; + if (freeslots == (__u64)0) + continue; + + return (i * 64) + ffs(freeslots); + } + + return SDT_TASK_ENTS_PER_CHUNK; +} + +/* + * Find and return an available idx on the allocator. + * Called with the task spinlock held. + */ +static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp) +{ + sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; + sdt_desc_t * __arena *desc_children; + struct sdt_chunk __arena *chunk; + sdt_desc_t *tmp; + __u64 lv_pos[SDT_TASK_LEVELS]; + __u64 u, pos, level; + __u64 idx = 0; + int ret; + + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + pos = chunk_find_empty(desc); + + /* If we error out, something has gone very wrong. */ + if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK)) + return NULL; + + if (pos == SDT_TASK_ENTS_PER_CHUNK) + return NULL; + + idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT; + idx |= pos; + + /* Log the levels to complete allocation. */ + lv_desc[level] = desc; + lv_pos[level] = pos; + + /* The rest of the loop is for internal node traversal. */ + if (level == SDT_TASK_LEVELS - 1) + break; + + /* Allocate an internal node if necessary. 
*/ + chunk = desc->chunk; + desc_children = (sdt_desc_t * __arena *)chunk->descs; + + desc = desc_children[pos]; + if (!desc) { + desc = scx_alloc_chunk(); + if (!desc) + return NULL; + + desc_children[pos] = desc; + } + } + + /* + * Finding the descriptor along with any internal node + * allocations was successful. Update all levels with + * the new allocation. + */ + bpf_for(u, 0, SDT_TASK_LEVELS) { + level = SDT_TASK_LEVELS - 1 - u; + tmp = lv_desc[level]; + + ret = set_idx_state(tmp, lv_pos[level], true); + if (ret != 0) + break; + + tmp->nr_free -= 1; + if (tmp->nr_free > 0) + break; + } + + *idxp = idx; + + return desc; +} + +__hidden +void __arena *scx_alloc(struct scx_allocator *alloc) +{ + struct sdt_data __arena *data = NULL; + struct sdt_chunk __arena *chunk; + sdt_desc_t *desc; + __u64 idx, pos; + + if (!alloc) + return NULL; + + bpf_spin_lock(&alloc_lock); + + /* We unlock if we encounter an error in the function. */ + desc = desc_find_empty(alloc->root, &idx); + if (unlikely(desc == NULL)) { + bpf_spin_unlock(&alloc_lock); + return NULL; + } + + chunk = desc->chunk; + + /* Populate the leaf node if necessary. */ + pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1); + data = chunk->data[pos]; + if (!data) { + data = scx_alloc_from_pool(&alloc->pool); + if (!data) { + scx_alloc_free_idx(alloc, idx); + bpf_spin_unlock(&alloc_lock); + return NULL; + } + } + + chunk->data[pos] = data; + + /* The data counts as a chunk */ + alloc_stats.data_allocs += 1; + alloc_stats.alloc_ops += 1; + alloc_stats.active_allocs += 1; + + data->tid.idx = idx; + + bpf_spin_unlock(&alloc_lock); + + return data; +} + +/* + * Task BPF map entry recording the task's assigned ID and pointing to the data + * area allocated in arena. + */ +struct scx_task_map_val { + union sdt_id tid; + __u64 tptr; + struct sdt_data __arena *data; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct scx_task_map_val); +} scx_task_map SEC(".maps"); + +static struct scx_allocator scx_task_allocator; + +__hidden +void __arena *scx_task_alloc(struct task_struct *p) +{ + struct sdt_data __arena *data = NULL; + struct scx_task_map_val *mval; + + mval = bpf_task_storage_get(&scx_task_map, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!mval) + return NULL; + + data = scx_alloc(&scx_task_allocator); + if (unlikely(!data)) + return NULL; + + mval->tid = data->tid; + mval->tptr = (__u64) p; + mval->data = data; + + return (void __arena *)data->payload; +} + +__hidden +int scx_task_init(__u64 data_size) +{ + return scx_alloc_init(&scx_task_allocator, data_size); +} + +__hidden +void __arena *scx_task_data(struct task_struct *p) +{ + struct sdt_data __arena *data; + struct scx_task_map_val *mval; + + scx_arena_subprog_init(); + + mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); + if (!mval) + return NULL; + + data = mval->data; + + return (void __arena *)data->payload; +} + +__hidden +void scx_task_free(struct task_struct *p) +{ + struct scx_task_map_val *mval; + + scx_arena_subprog_init(); + + mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); + if (!mval) + return; + + bpf_spin_lock(&alloc_lock); + scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx); + bpf_spin_unlock(&alloc_lock); + + bpf_task_storage_delete(&scx_task_map, p); +} + +static inline void +scx_stat_global_update(struct scx_stats __arena *stats) +{ + cast_kern(stats); + __sync_fetch_and_add(&stat_enqueue, stats->enqueue); + __sync_fetch_and_add(&stat_init, stats->init); + 
__sync_fetch_and_add(&stat_exit, stats->exit); + __sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu); + __sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu); +} + +s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + struct scx_stats __arena *stats; + bool is_idle = false; + s32 cpu; + + stats = scx_task_data(p); + if (!stats) { + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); + return 0; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc_select_idle_cpu(stats); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } else { + stat_inc_select_busy_cpu(stats); + } + + return cpu; +} + +void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct scx_stats __arena *stats; + + stats = scx_task_data(p); + if (!stats) { + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); + return; + } + + stat_inc_enqueue(stats); + + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_dsq_move_to_local(SHARED_DSQ); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct scx_stats __arena *stats; + + stats = scx_task_alloc(p); + if (!stats) { + scx_bpf_error("arena allocator out of memory"); + return -ENOMEM; + } + + stats->pid = p->pid; + + stat_inc_init(stats); + + return 0; +} + +void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{ + struct scx_stats __arena *stats; + + stats = scx_task_data(p); + if (!stats) { + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); + return; + } + + stat_inc_exit(stats); + scx_stat_global_update(stats); + + scx_task_free(p); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init) +{ + int ret; + + ret = scx_task_init(sizeof(struct scx_stats)); + if (ret < 0) { + scx_bpf_error("%s: failed with %d", __func__, ret); + return ret; + } + + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(sdt_ops, + .select_cpu = (void *)sdt_select_cpu, + .enqueue = (void *)sdt_enqueue, + .dispatch = (void *)sdt_dispatch, + .init_task = (void *)sdt_init_task, + .exit_task = (void *)sdt_exit_task, + .init = (void *)sdt_init, + .exit = (void *)sdt_exit, + .name = "sdt"); diff --git a/tools/sched_ext/scx_sdt.c b/tools/sched_ext/scx_sdt.c new file mode 100644 index 000000000000..b0363363476d --- /dev/null +++ b/tools/sched_ext/scx_sdt.c @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 Emil Tsalapatis + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include + +#include "scx_sdt.h" +#include "scx_sdt.bpf.skel.h" + +const char help_fmt[] = +"A simple arena-based sched_ext scheduler.\n" +"\n" +"Modified version of scx_simple that demonstrates arena-based data structures.\n" +"\n" +"Usage: %s [-f] [-v]\n" +"\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int sig) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_sdt *skel; + struct bpf_link *link; + __u32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(sdt_ops, scx_sdt); + + while ((opt = getopt(argc, argv, "fvh")) != -1) { + switch (opt) { + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei); + link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("====SCHEDULING STATS====\n"); + printf("enqueues=%llu\t", skel->bss->stat_enqueue); + printf("inits=%llu\t", skel->bss->stat_init); + printf("exits=%llu\t", skel->bss->stat_exit); + printf("\n"); + + printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu); + printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu); + printf("\n"); + + printf("====ALLOCATION STATS====\n"); + printf("chunk allocs=%llu\t", skel->bss->alloc_stats.chunk_allocs); + printf("data_allocs=%llu\n", skel->bss->alloc_stats.data_allocs); + printf("alloc_ops=%llu\t", skel->bss->alloc_stats.alloc_ops); + printf("free_ops=%llu\t", skel->bss->alloc_stats.free_ops); + printf("active_allocs=%llu\t", skel->bss->alloc_stats.active_allocs); + printf("arena_pages_used=%llu\t", skel->bss->alloc_stats.arena_pages_used); + printf("\n\n"); + + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_sdt__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_sdt.h b/tools/sched_ext/scx_sdt.h new file mode 100644 index 000000000000..67982ce9bc9b --- /dev/null +++ b/tools/sched_ext/scx_sdt.h @@ -0,0 +1,113 @@ +/* + * SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2025 Tejun Heo + * Copyright (c) 2025 Emil Tsalapatis + */ +#pragma once + +#ifndef __BPF__ +#define __arena +#endif /* __BPF__ */ + +struct scx_alloc_stats { + __u64 chunk_allocs; + __u64 data_allocs; + __u64 alloc_ops; + __u64 free_ops; + __u64 active_allocs; + __u64 arena_pages_used; +}; + +struct sdt_pool { + void __arena *slab; + __u64 elem_size; + __u64 max_elems; + __u64 idx; +}; + +#ifndef div_round_up +#define div_round_up(a, b) (((a) + (b) - 1) / (b)) +#endif + +#ifndef round_up +#define round_up(a, b) (div_round_up((a), (b)) * (b)) +#endif + +typedef struct sdt_desc __arena sdt_desc_t; + +enum sdt_consts { + SDT_TASK_ENTS_PER_PAGE_SHIFT = 9, + SDT_TASK_LEVELS = 3, + SDT_TASK_ENTS_PER_CHUNK = 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT, + SDT_TASK_CHUNK_BITMAP_U64S = div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64), + SDT_TASK_MIN_ELEM_PER_ALLOC = 8, +}; + +union sdt_id { + __s64 val; + struct { + __s32 idx; /* index in the radix tree */ + __s32 genn; /* ++'d on recycle so that it forms unique'ish 64bit ID */ + }; +}; + +struct sdt_chunk; + +/* + * Each index page is described by the following descriptor which carries the + * bitmap. This way the actual index can host power-of-two numbers of entries + * which makes indexing cheaper. + */ +struct sdt_desc { + __u64 allocated[SDT_TASK_CHUNK_BITMAP_U64S]; + __u64 nr_free; + struct sdt_chunk __arena *chunk; +}; + +/* + * Leaf node containing per-task data. + */ +struct sdt_data { + union sdt_id tid; + __u64 payload[]; +}; + +/* + * Intermediate node pointing to another intermediate node or leaf node. + */ +struct sdt_chunk { + union { + sdt_desc_t * descs[SDT_TASK_ENTS_PER_CHUNK]; + struct sdt_data __arena *data[SDT_TASK_ENTS_PER_CHUNK]; + }; +}; + +struct scx_allocator { + struct sdt_pool pool; + sdt_desc_t *root; +}; + +struct scx_stats { + int seq; + pid_t pid; + __u64 enqueue; + __u64 exit; + __u64 init; + __u64 select_busy_cpu; + __u64 select_idle_cpu; +}; + +#ifdef __BPF__ + +void __arena *scx_task_data(struct task_struct *p); +int scx_task_init(__u64 data_size); +void __arena *scx_task_alloc(struct task_struct *p); +void scx_task_free(struct task_struct *p); +void scx_arena_subprog_init(void); + +int scx_alloc_init(struct scx_allocator *alloc, __u64 data_size); +u64 scx_alloc_internal(struct scx_allocator *alloc); +int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx); + +#endif /* __BPF__ */ From bd4f0822f4ecd98285208e3322ad23c2ead0c878 Mon Sep 17 00:00:00 2001 From: zhidao su Date: Fri, 30 Jan 2026 18:55:11 +0800 Subject: [PATCH 6/8] tools/sched_ext: Add error logging for dsq creation failures in remaining schedulers Add scx_bpf_error() calls when scx_bpf_create_dsq() fails in the remaining schedulers to improve debuggability: - scx_simple.bpf.c: simple_init() - scx_sdt.bpf.c: sdt_init() - scx_cpu0.bpf.c: cpu0_init() - scx_flatcg.bpf.c: fcg_init() This follows the same pattern established in commit 2f8d489897ae ("sched_ext: Add error logging for dsq creation failures") for other schedulers and ensures consistent error reporting across all schedulers. 
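
For reference, each init callback above now follows the same condensed
pattern; DSQ_ID below stands in for the scheduler-specific constant
(DSQ_CPU0, FALLBACK_DSQ or SHARED_DSQ), see the hunks for the exact
per-scheduler changes:

	int ret;

	ret = scx_bpf_create_dsq(DSQ_ID, -1);
	if (ret) {
		scx_bpf_error("failed to create DSQ %d (%d)", DSQ_ID, ret);
		return ret;
	}

	return 0;
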
Signed-off-by: zhidao su Signed-off-by: Tejun Heo --- tools/sched_ext/scx_cpu0.bpf.c | 10 +++++++++- tools/sched_ext/scx_flatcg.bpf.c | 10 +++++++++- tools/sched_ext/scx_sdt.bpf.c | 8 +++++++- tools/sched_ext/scx_simple.bpf.c | 10 +++++++++- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c index 6326ce598c8e..9b67ab11b04c 100644 --- a/tools/sched_ext/scx_cpu0.bpf.c +++ b/tools/sched_ext/scx_cpu0.bpf.c @@ -71,7 +71,15 @@ void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev) s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init) { - return scx_bpf_create_dsq(DSQ_CPU0, -1); + int ret; + + ret = scx_bpf_create_dsq(DSQ_CPU0, -1); + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", DSQ_CPU0, ret); + return ret; + } + + return 0; } void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei) diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index c216480c3ee0..0e785cff0f24 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -929,7 +929,15 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init) { - return scx_bpf_create_dsq(FALLBACK_DSQ, -1); + int ret; + + ret = scx_bpf_create_dsq(FALLBACK_DSQ, -1); + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", FALLBACK_DSQ, ret); + return ret; + } + + return 0; } void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c index 48ea18614e28..d965f7d209de 100644 --- a/tools/sched_ext/scx_sdt.bpf.c +++ b/tools/sched_ext/scx_sdt.bpf.c @@ -691,7 +691,13 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init) return ret; } - return scx_bpf_create_dsq(SHARED_DSQ, -1); + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); + return ret; + } + + return 0; } void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei) diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index e6de99dba7db..b456bd7cae77 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -131,7 +131,15 @@ void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) { - return scx_bpf_create_dsq(SHARED_DSQ, -1); + int ret; + + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); + return ret; + } + + return 0; } void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) From 2e06d54ea9a25e2925a31eb5410af0f16baa8f19 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 1 Feb 2026 19:42:08 -0500 Subject: [PATCH 7/8] tools/sched_ext: Fix data header access during free in scx_sdt Fix a pointer arithmetic error in scx_sdt during freeing that causes the allocator to use the wrong memory address for the allocation's data header. 
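
In scx_alloc_free_idx(), data already points at chunk->data[pos], so
writing the refreshed header through data[pos] lands pos elements past
the allocation being freed instead of at its own header. A condensed
view of the intended access (illustrative; the hunk below is the actual
change):

	struct sdt_data __arena *data = chunk->data[pos];

	/* bump the generation counter in this allocation's header */
	*data = (struct sdt_data) {
		.tid.genn = data->tid.genn + 1,
	};
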
Fixes: 36929ebd17ae ("tools/sched_ext: add arena based scheduler") Signed-off-by: Emil Tsalapatis Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/scx_sdt.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c index d965f7d209de..31b09958e8d5 100644 --- a/tools/sched_ext/scx_sdt.bpf.c +++ b/tools/sched_ext/scx_sdt.bpf.c @@ -312,7 +312,7 @@ int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx) pos = idx & mask; data = chunk->data[pos]; if (likely(data)) { - data[pos] = (struct sdt_data) { + *data = (struct sdt_data) { .tid.genn = data->tid.genn + 1, }; From 4544e9c4ec9a5955a37fdd8204a3d98106f97ab7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 Feb 2026 09:40:22 -1000 Subject: [PATCH 8/8] selftests/sched_ext: Fix init_enable_count flakiness The init_enable_count test is flaky. The test forks 1024 children before attaching the scheduler to verify that existing tasks get ops.init_task() called. The children were using sleep(1) before exiting. 7900aa699c34 ("sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to finish_task_switch()") changed when tasks are removed from scx_tasks - previously when the task_struct was freed, now immediately in finish_task_switch() when the task dies. Before the commit, pre-forked children would linger on scx_tasks until freed regardless of when they exited, so the scheduler would always see them during iteration. The sleep(1) was unnecessary. After the commit, children are removed as soon as they die. The sleep(1) masks the problem in most cases but the test becomes flaky depending on timing. Fix by synchronizing properly using a pipe. All children block on read() and the parent signals them to exit by closing the write end after attaching the scheduler. The children are auto-reaped so there's no need to wait on them. Reported-by: Ihor Solodrai Cc: David Vernet Cc: Andrea Righi Cc: Changwoo Min Cc: Emil Tsalapatis Signed-off-by: Tejun Heo --- .../selftests/sched_ext/init_enable_count.c | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index eddf9e0e26e7..82c71653977b 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -4,6 +4,7 @@ * Copyright (c) 2023 David Vernet * Copyright (c) 2023 Tejun Heo */ +#include #include #include #include @@ -23,6 +24,9 @@ static enum scx_test_status run_test(bool global) int ret, i, status; struct sched_param param = {}; pid_t pids[num_pre_forks]; + int pipe_fds[2]; + + SCX_FAIL_IF(pipe(pipe_fds) < 0, "Failed to create pipe"); skel = init_enable_count__open(); SCX_FAIL_IF(!skel, "Failed to open"); @@ -38,26 +42,34 @@ static enum scx_test_status run_test(bool global) * ensure (at least in practical terms) that there are more tasks that * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that * take the fork() path either below or in other processes. + * + * All children will block on read() on the pipe until the parent closes + * the write end after attaching the scheduler, which signals all of + * them to exit simultaneously. Auto-reap so we don't have to wait on + * them. 
*/ + signal(SIGCHLD, SIG_IGN); for (i = 0; i < num_pre_forks; i++) { - pids[i] = fork(); - SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); - if (pids[i] == 0) { - sleep(1); + pid_t pid = fork(); + + SCX_FAIL_IF(pid < 0, "Failed to fork child"); + if (pid == 0) { + char buf; + + close(pipe_fds[1]); + read(pipe_fds[0], &buf, 1); + close(pipe_fds[0]); exit(0); } } + close(pipe_fds[0]); link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); SCX_FAIL_IF(!link, "Failed to attach struct_ops"); - for (i = 0; i < num_pre_forks; i++) { - SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], - "Failed to wait for pre-forked child\n"); - - SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, - status); - } + /* Signal all pre-forked children to exit. */ + close(pipe_fds[1]); + signal(SIGCHLD, SIG_DFL); bpf_link__destroy(link); SCX_GE(skel->bss->init_task_cnt, num_pre_forks);