sched/mmcid: Prevent CID stalls due to concurrent forks

A newly forked task is accounted as an MMCID user before the task is
visible in the process' thread list and the global task list. This creates
the following problem:

 CPU1                           CPU2
 fork()
   sched_mm_cid_fork(tnew1)
     tnew1->mm->mm_cid.users++;
     tnew1->mm_cid.cid = getcid();
 -> preemption
                                fork()
                                  sched_mm_cid_fork(tnew2)
                                    tnew2->mm->mm_cid.users++;
                                    // Reaches the per CPU threshold
                                    mm_cid_fixup_tasks_to_cpus()
                                      for_other_threads(current, p)
                                        ....

As tnew1 is not yet visible, the fixup fails to adjust the already
allocated CID of tnew1. As a consequence, a subsequent schedule-in might
fail to acquire a (transitional) CID and the machine stalls.

Prevent this by moving the invocation of sched_mm_cid_fork() to after the
point where the new task becomes visible in the thread list and the task
list.

This also makes it symmetric with exit(), where the task is removed as a
CID user before it is removed from the thread and task lists.
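
For illustration, a rough sketch of the resulting ordering; the condensed
pseudo code below is illustrative, not the literal kernel code:

  copy_process()
    ...
    // new task linked into the thread list and the task list
    sched_post_fork(p)
      sched_mm_cid_fork(p)  // becomes a CID user and gets a CID only now,
                            // when a concurrent
                            // mm_cid_fixup_tasks_to_cpus() iterating
                            // for_other_threads() can already observe p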

Fixes: fbd0e71dc3 ("sched/mmcid: Provide CID ownership mode fixup functions")
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260310202525.969061974@kernel.org
---
 3 files changed, 15 insertions(+), 11 deletions(-)

@@ -2354,7 +2354,6 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
 #ifdef CONFIG_SCHED_MM_CID
 void sched_mm_cid_before_execve(struct task_struct *t);
 void sched_mm_cid_after_execve(struct task_struct *t);
-void sched_mm_cid_fork(struct task_struct *t);
 void sched_mm_cid_exit(struct task_struct *t);
 static __always_inline int task_mm_cid(struct task_struct *t)
 {
@@ -2363,7 +2362,6 @@ static __always_inline int task_mm_cid(struct task_struct *t)
 #else
 static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_fork(struct task_struct *t) { }
 static inline void sched_mm_cid_exit(struct task_struct *t) { }
 static __always_inline int task_mm_cid(struct task_struct *t)
 {

@@ -1586,7 +1586,6 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
 
 	tsk->mm = mm;
 	tsk->active_mm = mm;
-	sched_mm_cid_fork(tsk);
 	return 0;
 }
 
@@ -2498,7 +2497,6 @@ __latent_entropy struct task_struct *copy_process(
 	exit_nsproxy_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm) {
-		sched_mm_cid_exit(p);
 		mm_clear_owner(p->mm, p);
 		mmput(p->mm);
 	}

@@ -4729,8 +4729,11 @@ void sched_cancel_fork(struct task_struct *p)
 	scx_cancel_fork(p);
 }
 
+static void sched_mm_cid_fork(struct task_struct *t);
+
 void sched_post_fork(struct task_struct *p)
 {
+	sched_mm_cid_fork(p);
 	uclamp_post_fork(p);
 	scx_post_fork(p);
 }
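
A remark on this hunk: sched_mm_cid_fork() is defined further down in the
file and becomes static (see the later hunk), hence the forward
declaration above sched_post_fork(). The call order below is a condensed
view assumed from the usual fork() flow, not a verbatim excerpt:

	copy_process()
		...
		write_unlock_irq(&tasklist_lock);  /* child now on the
						    * thread and task lists */
		...
		sched_post_fork(p);
			-> sched_mm_cid_fork(p);   /* safe: concurrent
						    * fixups can see p */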
@@ -10646,12 +10649,13 @@ static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
 	 * possible switch back to per task mode happens either in the
 	 * deferred handler function or in the next fork()/exit().
 	 *
-	 * The caller has already transferred. The newly incoming task is
-	 * already accounted for, but not yet visible.
+	 * The caller has already transferred so remove it from the users
+	 * count. The incoming task is already visible and has mm_cid.active,
+	 * but has task::mm_cid::cid == UNSET. Still it needs to be accounted
+	 * for. Concurrent fork()s might add more threads, but all of them have
+	 * task::mm_cid::active = 0, so they don't affect the accounting here.
 	 */
-	users = mm->mm_cid.users - 2;
-	if (!users)
-		return;
+	users = mm->mm_cid.users - 1;
 
 	guard(rcu)();
 	for_other_threads(current, t) {
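
The accounting change above, read with assumed numbers (illustrative, not
from the patch): suppose mm->mm_cid.users == 4, i.e. the caller, two older
threads and the freshly forked task. Previously the fresh task was not yet
on the thread list, so it had to be excluded as well (users - 2). Now it
is visible, so for_other_threads() covers users - 1 == 3 tasks: the two
older threads plus the new one, whose task::mm_cid::cid is still
MM_CID_UNSET and gets fixed up like the rest. This also explains why the
early return for users == 0 can go away: the caller and the new task are
always accounted, so users - 1 is at least 1 here.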
@@ -10688,12 +10692,15 @@ static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
 	return mm_update_max_cids(mm);
 }
 
-void sched_mm_cid_fork(struct task_struct *t)
+static void sched_mm_cid_fork(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
 	bool percpu;
 
-	WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+	if (!mm)
+		return;
+
+	WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET);
 
 	guard(mutex)(&mm->mm_cid.mutex);
 	scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
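
Why the split check: sched_mm_cid_fork() is now called unconditionally
from sched_post_fork(), i.e. also for tasks without a user mm (e.g. kernel
threads), so a NULL mm is a legitimate case that must return early rather
than trip the old combined WARN. A minimal sketch of the resulting guard,
with explanatory comments added (same names as in the hunk):

	static void sched_mm_cid_fork(struct task_struct *t)
	{
		struct mm_struct *mm = t->mm;

		/* Kernel threads have no user mm: nothing to account */
		if (!mm)
			return;

		/* A freshly forked user task must not own a CID yet */
		WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET);
		...
	}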
@@ -10885,6 +10892,7 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 }
 #else /* CONFIG_SCHED_MM_CID */
 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
 #endif /* !CONFIG_SCHED_MM_CID */
 
 static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);