From 4616a9c36be7e2e051ef53b0e8fd729da0277abf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2026 11:05:31 -1000 Subject: [PATCH 01/18] sched_ext: Move scx_error() out of scx_link_sched()'s lock region scx_link_sched() holds scx_sched_lock. The scx_error() calls inside take the same lock through scx_claim_exit() and deadlock. Move them out of the guard. Fixes: 6b4576b09714 ("sched_ext: Reject sub-sched attachment to a disabled parent") Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3f0d8aeaed81..7d367c140a36 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5585,10 +5585,12 @@ static void refresh_watchdog(void) static s32 scx_link_sched(struct scx_sched *sch) { + const char *err_msg; + s32 ret = 0; + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { #ifdef CONFIG_EXT_SUB_SCHED struct scx_sched *parent = scx_parent(sch); - s32 ret; if (parent) { /* @@ -5598,15 +5600,16 @@ static s32 scx_link_sched(struct scx_sched *sch) * parent can shoot us down. */ if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { - scx_error(sch, "parent disabled"); - return -ENOENT; + err_msg = "parent disabled"; + ret = -ENOENT; + break; } ret = rhashtable_lookup_insert_fast(&scx_sched_hash, &sch->hash_node, scx_sched_hash_params); if (ret) { - scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret); - return ret; + err_msg = "failed to insert into scx_sched_hash"; + break; } list_add_tail(&sch->sibling, &parent->children); @@ -5616,6 +5619,15 @@ static s32 scx_link_sched(struct scx_sched *sch) list_add_tail_rcu(&sch->all, &scx_sched_all); } + /* + * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after + * the guard above is released. + */ + if (ret) { + scx_error(sch, "%s (%d)", err_msg, ret); + return ret; + } + refresh_watchdog(); return 0; } From 363a53749cc483409498e8f6e1525fe081f1d9d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2026 12:09:21 -1000 Subject: [PATCH 02/18] sched_ext: Drop unused scx_find_sub_sched() stub scx_find_sub_sched()'s only caller, scx_bpf_sub_dispatch(), is gated on CONFIG_EXT_SUB_SCHED. When CONFIG_EXT_SUB_SCHED=n the caller compiles out and the stub becomes dead code, tripping -Wunused-function on randconfigs. Drop the stub. Fixes: 25037af712eb ("sched_ext: Add rhashtable lookup for sub-schedulers") Reported-by: kernel test robot Closes: https://lore.kernel.org/all/202605080556.42PXw8U9-lkp@intel.com/ Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7d367c140a36..48b4834c7027 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -297,7 +297,6 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) #else /* CONFIG_EXT_SUB_SCHED */ static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? 
NULL : root; }
-static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; }
 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
 #endif	/* CONFIG_EXT_SUB_SCHED */

From 1f91d0d5827512816789f74f4d72d16269bde1ec Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 7 May 2026 14:16:59 -1000
Subject: [PATCH 03/18] sched_ext: Fix !CONFIG_EXT_SUB_SCHED build warnings

W=1 with CONFIG_EXT_SUB_SCHED=n flags 'err_msg' uninitialized and
'err_free_lb_resched' unused. Initialize err_msg and gate the label.

Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 48b4834c7027..f4e2db8e56be 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5584,7 +5584,7 @@ static void refresh_watchdog(void)
 
 static s32 scx_link_sched(struct scx_sched *sch)
 {
-	const char *err_msg;
+	const char *err_msg = "";
 	s32 ret = 0;
 
 	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
@@ -6652,8 +6652,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
 	return sch;
+#ifdef CONFIG_EXT_SUB_SCHED
 err_free_lb_resched:
 	free_cpumask_var(sch->bypass_lb_resched_cpumask);
+#endif
 err_free_lb_cpumask:
 	free_cpumask_var(sch->bypass_lb_donee_cpumask);
 err_stop_helper:

From ab28a0673daabe7f0fcbd7a5e36334f2f003f02f Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Fri, 8 May 2026 19:50:45 +0800
Subject: [PATCH 04/18] sched_ext: Use IRQ_WORK_INIT_HARD() to initialize
 sch->disable_irq_work

On kernels built with PREEMPT_RT, scx_disable_irq_workfn() runs from a
per-CPU irq_work kthread. When scx_dump_state() is called from
scx_disable_irq_workfn() to report current->comm/pid, it therefore
always reports the irq_work kthread's comm/pid.

Initialize sch->disable_irq_work with IRQ_WORK_INIT_HARD() so that
scx_disable_irq_workfn() is called from hardirq context instead.

Fixes: f4a6c506d118 ("sched_ext: Always bounce scx_disable() through irq_work")
Signed-off-by: Zqiang
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f4e2db8e56be..df305712a2d4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6589,7 +6589,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	sch->slice_dfl = SCX_SLICE_DFL;
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
 
-	init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn);
+	sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn);
 	kthread_init_work(&sch->disable_work, scx_disable_workfn);
 	timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);

From 6947bea4b79115f50138882512f85fa9c93b2827 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Sun, 10 May 2026 10:08:16 -1000
Subject: [PATCH 05/18] sched_ext: Cleanups in preparation for the
 SCX_TASK_INIT_BEGIN/DEAD work

Cleanups in preparation for the state-machine work that follows:

- Convert three sub-sched call sites that open-code
  rcu_assign_pointer(p->scx.sched, ...) to scx_set_task_sched().

- Move scx_get_task_state()/scx_set_task_state() above the SCX task iter
  section so scx_task_iter_next_locked() can use them without a forward
  declaration.

No functional change.
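For reference, a minimal userspace sketch of the flag-field encoding the
two helpers operate on (helper names abbreviated here; the kernel
constants are SCX_TASK_STATE_SHIFT/BITS/MASK in
include/linux/sched/ext.h, with two state bits at this point in the
series):

	#define STATE_SHIFT	8
	#define STATE_BITS	2
	#define STATE_MASK	(((1 << STATE_BITS) - 1) << STATE_SHIFT)

	/* state constants are pre-shifted, so get/set need no shifting */
	static unsigned int get_state(unsigned int flags)
	{
		return flags & STATE_MASK;
	}

	static unsigned int set_state(unsigned int flags, unsigned int state)
	{
		return (flags & ~STATE_MASK) | state;
	}
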
Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 76 +++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index df305712a2d4..10c6e0261f11 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -711,6 +711,41 @@ struct bpf_iter_scx_dsq { } __attribute__((aligned(8))); +static u32 scx_get_task_state(const struct task_struct *p) +{ + return p->scx.flags & SCX_TASK_STATE_MASK; +} + +static void scx_set_task_state(struct task_struct *p, u32 state) +{ + u32 prev_state = scx_get_task_state(p); + bool warn = false; + + switch (state) { + case SCX_TASK_NONE: + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_READY: + warn = prev_state == SCX_TASK_NONE; + break; + case SCX_TASK_ENABLED: + warn = prev_state != SCX_TASK_READY; + break; + default: + WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); + return; + } + + WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", + prev_state, state, p->comm, p->pid); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state; +} + /* * SCX task iterator. */ @@ -3499,41 +3534,6 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ -static u32 scx_get_task_state(const struct task_struct *p) -{ - return p->scx.flags & SCX_TASK_STATE_MASK; -} - -static void scx_set_task_state(struct task_struct *p, u32 state) -{ - u32 prev_state = scx_get_task_state(p); - bool warn = false; - - switch (state) { - case SCX_TASK_NONE: - break; - case SCX_TASK_INIT: - warn = prev_state != SCX_TASK_NONE; - break; - case SCX_TASK_READY: - warn = prev_state == SCX_TASK_NONE; - break; - case SCX_TASK_ENABLED: - warn = prev_state != SCX_TASK_READY; - break; - default: - WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", - prev_state, state, p->comm, p->pid); - return; - } - - WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", - prev_state, state, p->comm, p->pid); - - p->scx.flags &= ~SCX_TASK_STATE_MASK; - p->scx.flags |= state; -} - static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) { int ret; @@ -5696,7 +5696,7 @@ static void scx_fail_parent(struct scx_sched *sch, scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { scx_disable_and_exit_task(sch, p); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); } } scx_task_iter_stop(&sti); @@ -5783,7 +5783,7 @@ static void scx_sub_disable(struct scx_sched *sch) */ scx_disable_and_exit_task(sch, p); scx_set_task_state(p, SCX_TASK_INIT); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); scx_set_task_state(p, SCX_TASK_READY); scx_enable_task(parent, p); } @@ -7212,7 +7212,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work) * $p is now only initialized for @sch and READY, which * is what we want. Assign it to @sch and enable. */ - rcu_assign_pointer(p->scx.sched, sch); + scx_set_task_sched(p, sch); scx_enable_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; From 938dd9ab2bd7df0a7e58ce4249794156be9530b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 06/18] sched_ext: Inline scx_init_task() and move RESET_RUNNABLE_AT into scx_set_task_state() Prepare for the SCX_TASK_INIT_BEGIN/DEAD work that follows by collapsing the scx_init_task() helper. 
Move the SCX_TASK_RESET_RUNNABLE_AT setting into scx_set_task_state() on the INIT transition (it was set unconditionally at every INIT site through the scx_init_task() helper), inline scx_init_task() into scx_fork() and scx_root_enable_workfn(), and drop the helper. As a side effect, scx_sub_disable() migration sequence now also sets RESET_RUNNABLE_AT (it previously wrote INIT directly without going through scx_init_task()). The flag triggers a runnable_at reset on the next set_task_runnable(), which is harmless on a task that has just been moved between scheds. On root-enable, p->scx.flags is written without the task's rq lock. The task isn't visible to scx yet, and a follow-up patch restores the lock-held write. v2: Note p->scx.flags rq-lock relaxation on root-enable path. (Andrea) Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 10c6e0261f11..81841277a54f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -726,6 +726,7 @@ static void scx_set_task_state(struct task_struct *p, u32 state) break; case SCX_TASK_INIT: warn = prev_state != SCX_TASK_NONE; + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: warn = prev_state == SCX_TASK_NONE; @@ -3585,22 +3586,6 @@ static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fo return 0; } -static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) -{ - int ret; - - ret = __scx_init_task(sch, p, fork); - if (!ret) { - /* - * While @p's rq is not locked. @p is not visible to the rest of - * SCX yet and it's safe to update the flags and state. - */ - p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; - scx_set_task_state(p, SCX_TASK_INIT); - } - return ret; -} - static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) { struct rq *rq = task_rq(p); @@ -3763,10 +3748,11 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) #else struct scx_sched *sch = scx_root; #endif - ret = scx_init_task(sch, p, true); - if (!ret) - scx_set_task_sched(p, sch); - return ret; + ret = __scx_init_task(sch, p, true); + if (unlikely(ret)) + return ret; + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); } return 0; @@ -6897,8 +6883,8 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_unlock(&sti); - ret = scx_init_task(sch, p, false); - if (ret) { + ret = __scx_init_task(sch, p, false); + if (unlikely(ret)) { put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", @@ -6906,6 +6892,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } + scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, sch); scx_set_task_state(p, SCX_TASK_READY); From cceb8fa9cb2cf98e31d81ecf6353b6ba5ac57744 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 07/18] sched_ext: Replace SCX_TASK_OFF_TASKS flag with SCX_TASK_DEAD state SCX_TASK_OFF_TASKS marked tasks already through sched_ext_dead() so cgroup task iteration would skip them. This can be expressed better with a task state. Replace the flag with SCX_TASK_DEAD. scx_disable_and_exit_task() resets state to NONE on its way out, so sched_ext_dead() now sets DEAD after the wrapper returns. 
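For reference, the complete set of accepted transitions after this
patch, as encoded in the scx_set_task_state() switch below (DEAD is
terminal):

  NONE    <- any state except DEAD
  INIT    <- NONE
  READY   <- INIT or ENABLED
  ENABLED <- READY
  DEAD    <- NONE
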
The validation matrix grows NONE -> DEAD, warns on DEAD -> NONE, and tightens READY's predecessor to INIT or ENABLED so the new DEAD value cannot silently transition to READY. Prepares for the following enable vs dead race fix. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 9 +++++---- kernel/sched/ext.c | 17 +++++++++++------ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index adb9a4de068a..9f1a326ad03e 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,24 +101,25 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ - SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* - * Bits 8 and 9 are used to carry task state: + * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext + * DEAD terminal state set by sched_ext_dead() */ - SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ - SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_SHIFT = 8, + SCX_TASK_STATE_BITS = 3, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 81841277a54f..2fc4a12711f9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -723,17 +723,22 @@ static void scx_set_task_state(struct task_struct *p, u32 state) switch (state) { case SCX_TASK_NONE: + warn = prev_state == SCX_TASK_DEAD; break; case SCX_TASK_INIT: warn = prev_state != SCX_TASK_NONE; p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: - warn = prev_state == SCX_TASK_NONE; + warn = !(prev_state == SCX_TASK_INIT || + prev_state == SCX_TASK_ENABLED); break; case SCX_TASK_ENABLED: warn = prev_state != SCX_TASK_READY; break; + case SCX_TASK_DEAD: + warn = prev_state != SCX_TASK_NONE; + break; default: WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", prev_state, state, p->comm, p->pid); @@ -972,11 +977,11 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) /* * cgroup_task_dead() removes the dead tasks from cset->tasks * after sched_ext_dead() and cgroup iteration may see tasks - * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS - * is set by sched_ext_dead() under @p's rq lock. Test it to + * which already finished sched_ext_dead(). %SCX_TASK_DEAD is + * set by sched_ext_dead() under @p's rq lock. Test it to * avoid visiting tasks which are already dead from SCX POV. */ - if (p->scx.flags & SCX_TASK_OFF_TASKS) { + if (scx_get_task_state(p) == SCX_TASK_DEAD) { __scx_task_iter_rq_unlock(iter); continue; } @@ -3847,7 +3852,7 @@ void sched_ext_dead(struct task_struct *p) * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. 
* - * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup * iteration is only used from sub-sched paths, which require root * enabled. Root enable transitions every live task to at least READY. @@ -3858,7 +3863,7 @@ void sched_ext_dead(struct task_struct *p) rq = task_rq_lock(p, &rf); scx_disable_and_exit_task(scx_task_sched(p), p); - p->scx.flags |= SCX_TASK_OFF_TASKS; + scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } } From c941d7391f258d5d06e0f7e962a52f99a547a83e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 08/18] sched_ext: Close root-enable vs sched_ext_dead() race with SCX_TASK_INIT_BEGIN scx_root_enable_workfn() drops the iter rq lock for ops.init_task() and a TASK_DEAD @p can fall through sched_ext_dead() in that window. The race hits when sched_ext_dead() observes SCX_TASK_INIT (the intermediate state before @p->scx.sched is published) and dereferences NULL via SCX_HAS_OP(NULL, exit_task), or observes SCX_TASK_NONE during the unlocked init window and skips cleanup so exit_task() never runs. Add SCX_TASK_INIT_BEGIN. The enable path writes NONE -> INIT_BEGIN under the iter rq lock, then takes the rq lock again after init to walk INIT_BEGIN -> INIT -> READY. sched_ext_dead() that wins the rq-lock race observes INIT_BEGIN and sets DEAD without calling into ops; the post-init recheck unwinds via scx_sub_init_cancel_task(). scx_fork() runs single-threaded against sched_ext_dead() (the task is not on scx_tasks until scx_post_fork() adds it) so its INIT_BEGIN -> INIT walk needs no rq-lock pairing; it rolls back to NONE on ops.init_task() failure. The validation matrix grows the INIT_BEGIN row and the INIT_BEGIN -> DEAD edge; INIT now requires INIT_BEGIN as the predecessor. scx_sub_disable()'s migration writes INIT_BEGIN as a synthetic predecessor to satisfy the tightened verification. The sub-sched paths still race with sched_ext_dead() during the unlocked init window. This will be fixed by the next patch. 
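Schematically, the closed window on the root-enable path (a sketch of
the intended interleaving, not the literal code):

  scx_root_enable_workfn()             sched_ext_dead()
  ------------------------             ----------------
  iter rq lock:
    NONE -> INIT_BEGIN
  iter rq unlock
  ops.init_task(p)                     task_rq_lock(p):
                                         sees INIT_BEGIN, skips ops,
                                         INIT_BEGIN -> DEAD
                                       task_rq_unlock(p)
  task_rq_lock(p):
    sees DEAD,
    scx_sub_init_cancel_task(sch, p)
  task_rq_unlock(p)
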
Reported-by: zhidao su Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 10 ++++--- kernel/sched/ext.c | 56 ++++++++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9f1a326ad03e..2129e18ada58 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -106,6 +106,7 @@ enum scx_ent_flags { * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet + * INIT_BEGIN ops.init_task() in flight; see sched_ext_dead() * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext @@ -116,10 +117,11 @@ enum scx_ent_flags { SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, - SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, - SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, - SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, - SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT_BEGIN = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 5 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2fc4a12711f9..29fa9ffe7c7b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -725,8 +725,11 @@ static void scx_set_task_state(struct task_struct *p, u32 state) case SCX_TASK_NONE: warn = prev_state == SCX_TASK_DEAD; break; - case SCX_TASK_INIT: + case SCX_TASK_INIT_BEGIN: warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_INIT_BEGIN; p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: @@ -737,7 +740,8 @@ static void scx_set_task_state(struct task_struct *p, u32 state) warn = prev_state != SCX_TASK_READY; break; case SCX_TASK_DEAD: - warn = prev_state != SCX_TASK_NONE; + warn = !(prev_state == SCX_TASK_NONE || + prev_state == SCX_TASK_INIT_BEGIN); break; default: WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", @@ -3753,9 +3757,12 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) #else struct scx_sched *sch = scx_root; #endif + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); ret = __scx_init_task(sch, p, true); - if (unlikely(ret)) + if (unlikely(ret)) { + scx_set_task_state(p, SCX_TASK_NONE); return ret; + } scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, sch); } @@ -3856,13 +3863,18 @@ void sched_ext_dead(struct task_struct *p) * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup * iteration is only used from sub-sched paths, which require root * enabled. Root enable transitions every live task to at least READY. + * + * %INIT_BEGIN means ops.init_task() is running for @p. Don't call + * into ops; transition to %DEAD so the post-init recheck unwinds + * via scx_sub_init_cancel_task(). 
*/ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_disable_and_exit_task(scx_task_sched(p), p); + if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) + scx_disable_and_exit_task(scx_task_sched(p), p); scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } @@ -5773,6 +5785,7 @@ static void scx_sub_disable(struct scx_sched *sch) * $p having already been initialized, and then enable. */ scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, parent); scx_set_task_state(p, SCX_TASK_READY); @@ -6878,6 +6891,9 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { + struct rq_flags rf; + struct rq *rq; + /* * @p may already be dead, have lost all its usages counts and * be waiting for RCU grace period before being freed. @p can't @@ -6886,10 +6902,26 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (!tryget_task_struct(p)) continue; + /* + * Set %INIT_BEGIN under the iter's rq lock so that a concurrent + * sched_ext_dead() does not call ops.exit_task() on @p while + * ops.init_task() is running. If sched_ext_dead() runs before + * this store, it has already removed @p from scx_tasks and the + * iter won't visit @p; if it runs after, it observes + * %INIT_BEGIN and transitions to %DEAD without calling ops, + * leaving the post-init recheck below to unwind. + */ + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_task_iter_unlock(&sti); ret = __scx_init_task(sch, p, false); + + rq = task_rq_lock(p, &rf); + if (unlikely(ret)) { + if (scx_get_task_state(p) != SCX_TASK_DEAD) + scx_set_task_state(p, SCX_TASK_NONE); + task_rq_unlock(rq, p, &rf); put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", @@ -6897,10 +6929,20 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } - scx_set_task_state(p, SCX_TASK_INIT); - scx_set_task_sched(p, sch); - scx_set_task_state(p, SCX_TASK_READY); + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. + * ops.exit_task() is owed to the sched __scx_init_task() + * ran against; call it now. + */ + scx_sub_init_cancel_task(sch, p); + } else { + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); + scx_set_task_state(p, SCX_TASK_READY); + } + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); From cd6aab736702f981ac4d128e04a4e33105ea797d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 09/18] sched_ext: Close sub-sched init race with post-init DEAD recheck scx_sub_enable_workfn()'s init pass and scx_sub_disable() migration both drop the rq lock to call __scx_init_task() against the other sched. A TASK_DEAD @p can fall through sched_ext_dead() in that window. sched_ext_dead() runs ops.exit_task() on the sched @p was attached to, not on the sched whose init just completed, so the new allocation leaks. Reuse the DEAD signal set by sched_ext_dead(). After __scx_init_task() returns, take task_rq_lock(p) and check for DEAD; on hit, call scx_sub_init_cancel_task() against the sub sched the init ran for and drop @p; on miss, proceed as before. 
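In sketch form, the recheck both call sites gain; 'other_sch' is a
placeholder for the sched the init ran against ($parent on the disable
path, @sch on the enable path):

	ret = __scx_init_task(other_sch, p, false);
	...
	rq = task_rq_lock(p, &rf);
	if (scx_get_task_state(p) == SCX_TASK_DEAD) {
		/* sched_ext_dead() won the race; undo the init just done */
		scx_sub_init_cancel_task(other_sch, p);
		task_rq_unlock(rq, p, &rf);
		put_task_struct(p);
		continue;
	}
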
Reported-by: zhidao su Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 29fa9ffe7c7b..6fbe3160eccd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5777,6 +5777,21 @@ static void scx_sub_disable(struct scx_sched *sch) } rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on @sch (the + * sched @p was on at that point), not on $parent. + * $parent's just-completed init is owed an exit_task() + * and we issue it here. + */ + scx_sub_init_cancel_task(parent, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { /* * $p is initialized for $parent and still attached to @@ -5791,8 +5806,8 @@ static void scx_sub_disable(struct scx_sched *sch) scx_set_task_state(p, SCX_TASK_READY); scx_enable_task(parent, p); } - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); @@ -7212,6 +7227,21 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto abort; rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on $parent (the + * sched @p was on at that point), not on @sch. @sch's + * just-completed init is owed an exit_task() and we + * issue it here. + */ + scx_sub_init_cancel_task(sch, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + p->scx.flags |= SCX_TASK_SUB_INIT; task_rq_unlock(rq, p, &rf); From d3e73a0808ddfb91ac36cd548643cbbeb00ad4db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 10/18] sched_ext: Handle SCX_TASK_NONE in disable/switched_from paths scx_fail_parent() leaves cgroup tasks at (state=NONE, sched=parent, sched_class=ext) until the parent itself is torn down by the scx_error() it raised. When the later root_disable iterates them, two paths trip on NONE. scx_disable_and_exit_task() re-enters the wrapper at NONE: the inner switch returns early but the trailing scx_set_task_sched(p, NULL) clobbers the parent sched left by scx_fail_parent(), and scx_set_task_state(p, NONE) wastes a write on an already-NONE task. switched_from_scx() then calls scx_disable_task(), which WARNs on non-ENABLED state and writes state=READY, producing a NONE -> READY transition the validation matrix rejects. Treat NONE as "nothing to do" in both paths. Add a NONE early-return at the top of scx_disable_and_exit_task() and a parallel NONE check in switched_from_scx() next to task_dead_and_done(). Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6fbe3160eccd..4efe0099f79a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3703,6 +3703,15 @@ static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct * static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { + /* + * %NONE means @p is already detached at the SCX level (e.g. handed + * back to the parent by scx_fail_parent() with no init to undo). 
+	 * Skip to avoid clobbering scx_task_sched() and writing %NONE again
+	 * on a state that's already %NONE.
+	 */
+	if (scx_get_task_state(p) == SCX_TASK_NONE)
+		return;
+
 	__scx_disable_and_exit_task(sch, p);
 
 	/*
@@ -3921,6 +3930,16 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
 	if (task_dead_and_done(p))
 		return;
 
+	/*
+	 * %NONE means SCX is no longer tracking @p at the task level (e.g.
+	 * scx_fail_parent() handed @p back to the parent at NONE pending the
+	 * parent's own teardown). There is nothing to disable; calling
+	 * scx_disable_task() would WARN on the non-%ENABLED state and trigger a
+	 * NONE -> READY validation failure.
+	 */
+	if (scx_get_task_state(p) == SCX_TASK_NONE)
+		return;
+
 	scx_disable_task(scx_task_sched(p), p);
 }
 

From 3788e32516530dee66cf9186f846480a16799b05 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Sun, 10 May 2026 19:52:11 +0200
Subject: [PATCH 11/18] selftests/sched_ext: Fix build error in dequeue
 selftest

Building the dequeue selftest with newer compilers (e.g., gcc 16)
triggers the following error:

  dequeue.c:28:22: error: variable 'sum' set but not used

The 'volatile' qualifier prevents the writes from being optimized away,
but does not silence the warning: 'sum' is indeed only written and never
read.

Consume 'sum' via an empty asm() with a register input constraint. This
forces the compiler to keep the accumulated value (preserving the CPU
stress loop) and avoids the build error.

Fixes: 658ad2259b3e ("selftests/sched_ext: Add test to validate ops.dequeue() semantics")
Signed-off-by: Andrea Righi
Signed-off-by: Tejun Heo
---
 tools/testing/selftests/sched_ext/dequeue.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
index 4e93262703ca..383d06e972a4 100644
--- a/tools/testing/selftests/sched_ext/dequeue.c
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -33,6 +33,7 @@ static void worker_fn(int id)
 	/* Do some work to trigger scheduling events */
 	for (j = 0; j < 10000; j++)
 		sum += j;
+	asm volatile("" : : "r"(sum));
 
 	/* Sleep to trigger dequeue */
 	usleep(1000 + (id * 100));

From bbf30b383cf6e87f2fe57c292fbd640b1d88b4c3 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Mon, 11 May 2026 08:18:12 +0200
Subject: [PATCH 12/18] sched_ext: Fix ops->priv clobber on concurrent
 attach/detach

Under heavy concurrent attach/detach operations, scx_claim_exit() can
trigger a NULL pointer dereference. This can be reproduced by running
the reload_loop kselftest inside a virtme-ng session:

  $ vng -v -- ./tools/testing/selftests/sched_ext/runner -t reload_loop
  ...
  BUG: kernel NULL pointer dereference, address: 0000000000000400
  RIP: 0010:scx_claim_exit+0x3b/0x120
  Call Trace:
   bpf_scx_unreg+0x45/0xb0
   bpf_struct_ops_map_link_dealloc+0x39/0x50
   bpf_link_release+0x18/0x20
   __fput+0x10b/0x2e0
   __x64_sys_close+0x47/0xa0

The underlying race (diagnosed by Tejun Heo) is a stomp of @ops->priv,
not a missing NULL check:

  T2 unreg(K)                            T1 reg(K)
  -----------                            ---------
  sch = ops->priv = sch_b800
  scx_disable; flush_disable_work
  [scx_root_disable: scx_root=NULL,
   mutex_unlock, state=DISABLED]
                                         mutex_lock; state ok
                                         scx_alloc_and_add_sched:
                                           ops->priv = sch_a800
                                         scx_root = sch_a800; init=0
                                         state=ENABLED; mutex_unlock
  [flush returns]
  RCU_INIT_POINTER(ops->priv, NULL)
    <-- clobbers sch_a800
  kobject_put(sch_b800)

T1 acquires scx_enable_mutex inside scx_root_disable()'s mutex_unlock
window and starts a fresh attach on the same kdata, assigning sch_a800
to @ops->priv.

T2 then continues out of scx_disable()/flush_disable_work and clobbers @ops->priv to NULL, leaking sch_a800; the bpf_link is gone but state stays SCX_ENABLED, so all future attaches fail with -EBUSY permanently. The next bpf_scx_unreg() on that kdata then reads NULL @ops->priv and dereferences it in scx_claim_exit(). Make @ops->priv the lifecycle binding: in scx_root_enable_workfn() and scx_sub_enable_workfn(), after the existing state check and still under scx_enable_mutex, refuse with -EBUSY if @ops->priv is non-NULL. This rejects an attempt to reuse a kdata that is still bound to a previous scheduler instance, closing the race without changing the unreg side. Fixes: 105dcd005be2 ("sched_ext: Introduce scx_prog_sched()") Suggested-by: Tejun Heo Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4efe0099f79a..8e06694094d7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6803,6 +6803,19 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_unlock; } + /* + * @ops->priv binds @ops to its scx_sched instance. It is set here by + * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(), + * which runs after scx_root_disable() has dropped scx_enable_mutex. If + * it's still non-NULL here, a previous attachment on @ops has not + * finished tearing down; proceeding would let the in-flight unreg's + * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. + */ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto err_unlock; + } + ret = alloc_kick_syncs(); if (ret) goto err_unlock; @@ -7120,6 +7133,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto out_unlock; } + /* See scx_root_enable_workfn() for the @ops->priv check. */ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto out_unlock; + } + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); From 86ecb1c1a1f5c1bf4a45b91f54f8220c3121bd3b Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 11 May 2026 10:31:30 +0200 Subject: [PATCH 13/18] sched_ext: Clear ops->priv on scx_alloc_and_add_sched() error paths scx_alloc_and_add_sched() can fail after @sch has been assigned to ops->priv. In those cases @sch is torn down (either via kfree() through the err_free_* chain or via kobject_put() -> scx_kobj_release() -> RCU work), but @ops->priv is left pointing at the about-to-be-freed pointer. With the recent -EBUSY gate in scx_root_enable_workfn() and scx_sub_enable_workfn() that rejects an attach when @ops->priv is still non-NULL, see commit bbf30b383cf6 ("sched_ext: Fix ops->priv clobber on concurrent attach/detach"), a dangling @ops->priv permanently locks the kdata out: every future attach attempt sees a stale binding and returns -EBUSY even though no scheduler is actually attached. Clear @ops->priv on the post-assign failure paths so that the kdata returns to its pre-attach state when the function returns ERR_PTR(). 
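The resulting pairing, in sketch form (the attach-side gate is from the
clobber fix above; the clear is what this patch adds):

	/* attach side, under scx_enable_mutex */
	if (rcu_access_pointer(ops->priv))
		return -EBUSY;			/* kdata still bound */

	/* scx_alloc_and_add_sched() failure after the assignment */
	RCU_INIT_POINTER(ops->priv, NULL);	/* kdata unbound again */
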
Fixes: bbf30b383cf6 ("sched_ext: Fix ops->priv clobber on concurrent attach/detach") Suggested-by: Tejun Heo Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8e06694094d7..1efd5d82b08b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6670,6 +6670,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } @@ -6677,6 +6678,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, if (ops->sub_attach) { sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); if (!sch->sub_kset) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(-ENOMEM); } @@ -6684,6 +6686,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #else /* CONFIG_EXT_SUB_SCHED */ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } @@ -6692,6 +6695,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #ifdef CONFIG_EXT_SUB_SCHED err_free_lb_resched: + RCU_INIT_POINTER(ops->priv, NULL); free_cpumask_var(sch->bypass_lb_resched_cpumask); #endif err_free_lb_cpumask: From 9a415cc53711f2238e0f0ca8a6bcc796c003b127 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 May 2026 12:05:48 -1000 Subject: [PATCH 14/18] sched_ext: Avoid UAF in scx_root_enable_workfn() init failure path In scx_root_enable_workfn(), put_task_struct(p) is called before scx_error() dereferences p->comm and p->pid. If the iterator's reference is the last drop, the task is freed synchronously and the deref becomes a UAF. Move put_task_struct() past scx_error(). Reported-by: Sashiko Closes: https://lore.kernel.org/all/20260511214031.AF5E9C2BCB0@smtp.kernel.org/ Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class") Cc: stable@vger.kernel.org # v6.12+ Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1efd5d82b08b..9354da79e162 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6973,10 +6973,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (scx_get_task_state(p) != SCX_TASK_DEAD) scx_set_task_state(p, SCX_TASK_NONE); task_rq_unlock(rq, p, &rf); - put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); + put_task_struct(p); goto err_disable_unlock_all; } From 39e25a2100604320e8d9df54c6c31258f7a3df29 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 May 2026 10:30:00 -1000 Subject: [PATCH 15/18] sched_ext: Drop NONE early return in scx_disable_and_exit_task() d3e73a0808dd ("sched_ext: Handle SCX_TASK_NONE in disable/switched_from paths") skipped the trailing scx_set_task_sched(p, NULL) on NONE tasks. After scx_fail_parent() parks a task at NONE/sched=parent and the parent is later freed via queue_rcu_work() during root_disable, the preserved p->scx.sched dangles - print_scx_info() from sched_show_task() reads sch->ops.name from freed memory. Drop the early return. 
__scx_disable_and_exit_task() already short-circuits on NONE and the
SUB_INIT block was cleared by scx_fail_parent()'s earlier call, so
clearing p->scx.sched is the only work left - and the one thing the path
actually needs.

v2: Extend the SUB_INIT block comment to note that the flag is only set
    on the sub-enable path, so it's always clear on the NONE re-entry
    (Andrea).

Fixes: d3e73a0808dd ("sched_ext: Handle SCX_TASK_NONE in disable/switched_from paths")
Signed-off-by: Tejun Heo
Reviewed-by: Andrea Righi
---
 kernel/sched/ext.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9354da79e162..68120f679178 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3703,22 +3703,14 @@ static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *
 
 static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p)
 {
-	/*
-	 * %NONE means @p is already detached at the SCX level (e.g. handed
-	 * back to the parent by scx_fail_parent() with no init to undo).
-	 * Skip to avoid clobbering scx_task_sched() and writing %NONE again
-	 * on a state that's already %NONE.
-	 */
-	if (scx_get_task_state(p) == SCX_TASK_NONE)
-		return;
-
 	__scx_disable_and_exit_task(sch, p);
 
 	/*
 	 * If set, @p exited between __scx_init_task() and scx_enable_task() in
 	 * scx_sub_enable() and is initialized for both the associated sched and
 	 * its parent. Exit for the child too - scx_enable_task() never ran for
-	 * it, so undo only init_task.
+	 * it, so undo only init_task. The flag is only set on the sub-enable
+	 * path, so it's always clear when @p arrives here in %SCX_TASK_NONE.
 	 */
 	if (p->scx.flags & SCX_TASK_SUB_INIT) {
 		if (!WARN_ON_ONCE(!scx_enabling_sub_sched))

From b273b75b8d677aea06dd06d80b61b3bb06e94680 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 11 May 2026 13:18:19 -1000
Subject: [PATCH 16/18] sched_ext: INIT_LIST_HEAD() &sch->all in
 scx_alloc_and_add_sched()

On scx_link_sched() error paths (parent disabled, hash insert failure),
&sch->all is never added to scx_sched_all. The cleanup path runs
scx_unlink_sched() unconditionally, which calls list_del_rcu(&sch->all)
on a list_head that was never initialized, triggering a corruption
warning.

Initialize &sch->all.

Fixes: 54be8de4236a ("sched_ext: Factor out scx_link_sched() and scx_unlink_sched()")
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 68120f679178..6d69ba29cfd7 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6635,6 +6635,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 
 	rcu_assign_pointer(ops->priv, sch);
 	sch->kobj.kset = scx_kset;
+	INIT_LIST_HEAD(&sch->all);
 
 #ifdef CONFIG_EXT_SUB_SCHED
 	char *buf = kzalloc(PATH_MAX, GFP_KERNEL);

From cceb874eee46fe4b3d3c6c496f19125d9a3a9a8f Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 11 May 2026 13:18:23 -1000
Subject: [PATCH 17/18] sched_ext: Defer sub_kset base put to
 scx_sched_free_rcu_work

scx_sub_enable_workfn() pins parent->kobj before dropping
scx_sched_lock, but that does not pin parent->sub_kset. A concurrent
disable can kset_unregister() and free sub_kset before
scx_alloc_and_add_sched() dereferences it.

Split the sub_kset teardown: kobject_del() at disable keeps the sysfs
removal; defer kobject_put() to scx_sched_free_rcu_work so the memory
survives.

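In sketch form (kset_unregister() is kobject_del() plus kobject_put(),
so this only moves the final put):

	/* disable path: remove from sysfs, keep the memory */
	kobject_del(&sch->sub_kset->kobj);

	/* scx_sched_free_rcu_work(): drop the base reference */
	kobject_put(&sch->sub_kset->kobj);
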
A racing child sees state_in_sysfs=0 with valid memory,
sysfs_create_dir() fails, and the existing exit_kind gate in
scx_link_sched() turns it away with -ENOENT.

Fixes: 411d3ef1a705 ("sched_ext: Unregister sub_kset on scheduler disable")
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6d69ba29cfd7..23f7b3f63b09 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4821,6 +4821,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	kfree(sch->cgrp_path);
 	if (sch_cgroup(sch))
 		cgroup_put(sch_cgroup(sch));
+	if (sch->sub_kset)
+		kobject_put(&sch->sub_kset->kobj);
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
 	for_each_possible_cpu(cpu) {
@@ -5861,7 +5863,7 @@ static void scx_sub_disable(struct scx_sched *sch)
 	if (sch->ops.exit)
 		SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
 	if (sch->sub_kset)
-		kset_unregister(sch->sub_kset);
+		kobject_del(&sch->sub_kset->kobj);
 	kobject_del(&sch->kobj);
 }
 #else	/* CONFIG_EXT_SUB_SCHED */
@@ -5995,7 +5997,7 @@ static void scx_root_disable(struct scx_sched *sch)
 	 */
#ifdef CONFIG_EXT_SUB_SCHED
 	if (sch->sub_kset)
-		kset_unregister(sch->sub_kset);
+		kobject_del(&sch->sub_kset->kobj);
 #endif
 	kobject_del(&sch->kobj);

From 6ae315d37924435516d697ea7dde0b799a5928e0 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Wed, 13 May 2026 13:24:38 +0200
Subject: [PATCH 18/18] sched_ext: Use HK_TYPE_DOMAIN_BOOT to detect isolcpus=
 domain isolation

scx_enable() refuses to attach a BPF scheduler when isolcpus=domain is
in effect by comparing housekeeping_cpumask(HK_TYPE_DOMAIN) against
cpu_possible_mask. Since commit 27c3a5967f05 ("sched/isolation: Convert
housekeeping cpumasks to rcu pointers"), HK_TYPE_DOMAIN's cpumask is RCU
protected and dereferencing it requires either the RCU read lock, the
cpu_hotplug write lock, or the cpuset lock; scx_enable() holds none of
these, so booting with isolcpus=domain and attaching any BPF scheduler
triggers the following lockdep splat:

  =============================
  WARNING: suspicious RCU usage
  -----------------------------
  kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage!

  1 lock held by scx_flash/281:
   #0: ffffffff8379fce0 (update_mutex){+.+.}-{4:4}, at: bpf_struct_ops_link_create+0x134/0x1c0

  Call Trace:
   dump_stack_lvl+0x6f/0xb0
   lockdep_rcu_suspicious.cold+0x37/0x70
   housekeeping_cpumask+0xcd/0xe0
   scx_enable.isra.0+0x17/0x120
   bpf_scx_reg+0x5e/0x80
   bpf_struct_ops_link_create+0x151/0x1c0
   __sys_bpf+0x1e4b/0x33c0
   __x64_sys_bpf+0x21/0x30
   do_syscall_64+0x117/0xf80
   entry_SYSCALL_64_after_hwframe+0x77/0x7f

In addition, commit 03ff73510169 ("cpuset: Update HK_TYPE_DOMAIN cpumask
from cpuset") made HK_TYPE_DOMAIN include cpuset isolated partitions as
well, which means the current check also rejects BPF schedulers when a
cpuset partition is active. That contradicts the original intent of
commit 9f391f94a173 ("sched_ext: Disallow loading BPF scheduler if
isolcpus= domain isolation is in effect"), which explicitly noted that
cpuset partitions are honored through per-task cpumasks and should not
be rejected.

Switch to housekeeping_enabled(HK_TYPE_DOMAIN_BOOT), which reads only
the housekeeping flag bit (no RCU dereference) and reflects exactly the
boot-time isolcpus= configuration that the error message refers to.

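For comparison, a sketch of what an RCU-safe read of the cpumask itself
would look like (the flag test avoids even this, and also keeps
cpuset-created isolation out of the check):

	bool isolated;

	rcu_read_lock();
	isolated = !cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
				  cpu_possible_mask);
	rcu_read_unlock();
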
Fixes: 27c3a5967f05 ("sched/isolation: Convert housekeeping cpumasks to rcu pointers") Cc: stable@vger.kernel.org # v7.0+ Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo Acked-by: Frederic Weisbecker --- kernel/sched/ext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 23f7b3f63b09..a6d0a93d8174 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7415,8 +7415,7 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) static DEFINE_MUTEX(helper_mutex); struct scx_enable_cmd cmd; - if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), - cpu_possible_mask)) { + if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; }