From cb48828f06afa232cc330f0f4d6be101067810b3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 23 Apr 2026 20:17:45 +0100 Subject: [PATCH 01/13] selftests/rseq: Don't run tests with runner scripts outside of the scripts The rseq selftests include two runner scripts run_param_test.sh and run_syscall_errors_test.sh which set up the environment for test binaries and run them with various parameters. Currently we list these test binaries in TEST_GEN_PROGS but this results in the kselftest framework running them directly as well as via the runners, resulting in duplication and spurious failures when the environment is not correctly set up (e.g., if glibc tries to use rseq). Move the binaries the runners invoke to TEST_GEN_PROGS_EXTENDED; binaries listed there are built but not run by the framework. The param_test benchmarks are not moved since they are not run by run_param_test.sh. Fixes: 830969e7821a ("selftests/rseq: Implement time slice extension test") Signed-off-by: Mark Brown Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260423-selftests-rseq-use-runner-v1-1-e13a133754c1@kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/Makefile | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 4ef90823b652..0d1947c0d623 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -14,12 +14,15 @@ LDLIBS += -lpthread -ldl # still track changes to header files and depend on shared object. OVERRIDE_TARGETS = 1 -TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ - param_test_benchmark param_test_compare_twice param_test_mm_cid \ - param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ - syscall_errors_test slice_test +TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test \ + param_test_benchmark param_test_mm_cid_benchmark slice_test -TEST_GEN_PROGS_EXTENDED = librseq.so +TEST_GEN_PROGS_EXTENDED = librseq.so \ + param_test \ + param_test_compare_twice \ + param_test_mm_cid \ + param_test_mm_cid_compare_twice \ + syscall_errors_test TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh From 2cb68e45120dfc66404c7547d95b8ac6ff0b25ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Apr 2026 10:10:19 +0200 Subject: [PATCH 02/13] rseq: Set rseq::cpu_id_start to 0 on unregistration Unregistration historically wrote 0 to rseq::cpu_id_start. The RSEQ rework changed that to RSEQ_CPU_ID_UNINITIALIZED, which is obviously incompatible. Revert to the original behavior.
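For illustration, a minimal user-space sketch of the state this restores after unregistration. This is a hedged sketch: it assumes a raw rseq() syscall invocation, an area registered earlier with the legacy 32-byte size, and the selftests' signature value; rseq_area is a hypothetical name for that earlier registration.

    #include <linux/rseq.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define RSEQ_SIG	0x53053053	/* assumed signature, matches the selftests */

    /* Hypothetical: area registered earlier via sys_rseq(rseq_area, 32, 0, RSEQ_SIG) */
    extern struct rseq *rseq_area;

    static int check_unregistered_state(void)
    {
    	if (syscall(__NR_rseq, rseq_area, 32, RSEQ_FLAG_UNREGISTER, RSEQ_SIG))
    		return -1;
    	/*
    	 * With this fix the kernel writes the fields one final time:
    	 * cpu_id_start is 0 again, cpu_id is RSEQ_CPU_ID_UNINITIALIZED,
    	 * node_id and mm_cid are 0.
    	 */
    	printf("cpu_id_start=%u cpu_id=%d\n",
    	       rseq_area->cpu_id_start, (int)rseq_area->cpu_id);
    	return 0;
    }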
Fixes: 0f085b41880e ("rseq: Provide and use rseq_set_ids()") Reported-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.271566313%40kernel.org Cc: stable@vger.kernel.org --- kernel/rseq.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/rseq.c b/kernel/rseq.c index 38d3ef540760..b9f11931ef78 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -236,11 +236,6 @@ static int __init rseq_debugfs_init(void) } __initcall(rseq_debugfs_init); -static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) -{ - return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); -} - static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { struct rseq __user *urseq = t->rseq.usrptr; @@ -384,19 +379,22 @@ void rseq_syscall(struct pt_regs *regs) static bool rseq_reset_ids(void) { - struct rseq_ids ids = { - .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, - .mm_cid = 0, - }; + struct rseq __user *rseq = current->rseq.usrptr; /* * If this fails, terminate it because this leaves the kernel in * stupid state as exit to user space will try to fixup the ids * again. */ - if (rseq_set_ids(current, &ids, 0)) - return true; + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(0, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0, &rseq->node_id, efault); + unsafe_put_user(0, &rseq->mm_cid, efault); + } + return true; +efault: force_sig(SIGSEGV); return false; } From e9766e6f7d330dce7530918d8c6e3ec96d6c6e24 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Apr 2026 10:14:41 +0200 Subject: [PATCH 03/13] rseq: Protect rseq_reset() against interrupts rseq_reset() uses memset() to clear the task's rseq data. That's racy against membarrier() and preemption. Guard it with irqsave to cure this. Fixes: faba9d250eae ("rseq: Introduce struct rseq_data") Reported-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.353887714%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index b9d62fc2140d..f446909551df 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -119,6 +119,8 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { + /* Protect against preemption and membarrier IPI */ + guard(irqsave)(); memset(&t->rseq, 0, sizeof(t->rseq)); t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } From 010b7723c0a3b9ad58f50b715dbe2e7781d29400 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Apr 2026 09:34:45 +0200 Subject: [PATCH 04/13] rseq: Don't advertise time slice extensions if disabled If time slice extensions have been disabled on the kernel command line, then advertising them in RSEQ flags is wrong. Adjust the conditionals to reflect reality and fix up the misleading comments about the flag gap and the rseq::flags field.
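As a user-space consumption sketch (hedged: assumes the uapi additions from this series, and rseq_area is a hypothetical pointer to the area registered via sys_rseq()), the advertised bits are meant to be probed from rseq::flags rather than inferred from the kernel configuration:

    #include <linux/rseq.h>
    #include <stdbool.h>

    /* Hypothetical: pointer to the area registered via sys_rseq() */
    extern struct rseq *rseq_area;

    static bool slice_ext_active(void)
    {
    	unsigned int fl = rseq_area->flags;

    	/*
    	 * AVAILABLE is only set when the kernel supports slice extensions
    	 * and they were not disabled on the command line. ENABLED further
    	 * requires RSEQ_FLAG_SLICE_EXT_DEFAULT_ON at registration time or
    	 * a later opt-in via prctl().
    	 */
    	return (fl & RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE) &&
    	       (fl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED);
    }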
Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.437059375%40kernel.org Cc: stable@vger.kernel.org --- include/uapi/linux/rseq.h | 5 ++++- kernel/rseq.c | 9 +++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index f69344fe6c08..ca6fe1f9d05e 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -28,7 +28,7 @@ enum rseq_cs_flags_bit { RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, - /* (3) Intentional gap to put new bits into a separate byte */ + /* (3) Intentional gap to keep new bits separate */ /* User read only feature flags */ RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT = 4, @@ -161,6 +161,9 @@ struct rseq { * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE + * + * It is now used for feature status advertisement by the kernel. + * See: enum rseq_cs_flags_bit for further information. */ __u32 flags; diff --git a/kernel/rseq.c b/kernel/rseq.c index b9f11931ef78..586f58f652c6 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -462,10 +462,11 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return -EFAULT; if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; - if (rseq_slice_extension_enabled() && - (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + if (rseq_slice_extension_enabled()) { + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } } scoped_user_write_access(rseq, efault) { From b9eac6a9d93c952c4b7775a24d5c7a1bbf4c3c00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Apr 2026 00:47:54 +0200 Subject: [PATCH 05/13] rseq: Revert to historical performance killing behaviour The recent RSEQ optimization work broke the TCMalloc abuse of the RSEQ ABI as it no longer unconditionally updates the CPU, node, mm_cid fields, which are documented as read only for user space. Due to the observed behavior of the kernel it was possible for TCMalloc to overwrite the cpu_id_start field for their own purposes and rely on the kernel to update it unconditionally after each context switch and before signal delivery. The RSEQ ABI only guarantees that these fields are updated when the data changes, i.e. the task is migrated or the MM CID of the task changes due to switching from or to per CPU ownership mode. The optimization work eliminated the unconditional updates and reduced them to the documented ABI guarantees, which results in a massive performance win for syscall- and scheduling-heavy workloads, but in turn breaks the TCMalloc expectations. There have been several options discussed to restore the TCMalloc functionality while preserving the optimization benefits. They all end up in a series of hard-to-maintain workarounds, which in the worst case introduce overhead for everyone, e.g. in the scheduler. The requirements of TCMalloc and the optimization work are diametrically opposed, and the required workarounds are a maintenance burden. They end up as fragile constructs, which are blocking further optimization work and are pretty much guaranteed to cause more subtle issues down the road.
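For reference, the ABI-violating pattern described above looks roughly like the sketch below. This is a paraphrase of the described trick, not TCMalloc's actual code; rseq_area and handle_sentinel are hypothetical names.

    #include <linux/rseq.h>
    #include <stdint.h>

    /* Hypothetical: legacy 32-byte area registered by the allocator */
    extern struct rseq *rseq_area;
    /* Hypothetical slow path invoked while the sentinel is still intact */
    uint32_t handle_sentinel(uint32_t raw);

    static inline uint32_t read_cpu_fast(void)
    {
    	uint32_t v = rseq_area->cpu_id_start;

    	/*
    	 * The allocator parks the upper half of a tagged pointer in
    	 * cpu_id_start; its top bit corresponds to bit 63 of the pointer
    	 * and is never set in a real CPU number. The legacy kernel
    	 * behaviour rewrites the field on every context switch, so a
    	 * surviving sentinel means "not rescheduled since it was written".
    	 */
    	if (v & 0x80000000u)
    		return handle_sentinel(v);
    	return v;
    }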
The optimization work heavily depends on the generic entry code, which is not used by all architectures yet. So the rework preserved the original mechanism mostly unmodified to keep the support for architectures which handle rseq in their own exit to user space loop. That code is currently optimized out by the compiler on architectures which use the generic entry code. This allows reverting to the original behaviour by replacing the compile time constant conditions with a runtime condition where required, which disables the optimization and the dependent time slice extension feature until the run-time condition can be enabled in the RSEQ registration code on a per-task basis again. The following changes are required to restore the original behavior, which makes TCMalloc work again: 1) Replace the compile time constant conditionals with runtime conditionals where appropriate to prevent the compiler from optimizing the legacy mode out 2) Enforce unconditional update of IDs on context switch for the non-optimized v1 mode 3) Enforce update of IDs in the pre signal delivery path for the non-optimized v1 mode 4) Enforce update of IDs in the membarrier(RSEQ) IPI for the non-optimized v1 mode 5) Make time slice and future extensions depend on optimized v2 mode This brings back the full performance problems, but preserves the v2 optimization code and, for architectures using the generic entry code, also the TIF_RSEQ optimization which avoids a full evaluation of the exit to user mode loop in many cases. Fixes: 566d8015f7ee ("rseq: Avoid CPU/MM CID updates when no event pending") Reported-by: Mathias Stearn Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Closes: https://lore.kernel.org/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com Link: https://patch.msgid.link/20260428224427.517051752%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq.h | 35 ++++++++++++++++++++++----------- include/linux/rseq_entry.h | 39 +++++++++++++++++++++++++++---------- include/linux/rseq_types.h | 9 ++++++++- kernel/rseq.c | 40 +++++++++++++++++++++++++++++++------- kernel/sched/membarrier.c | 11 ++++++++++- 5 files changed, 104 insertions(+), 30 deletions(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index f446909551df..7ef79b25e714 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -9,6 +9,11 @@ void __rseq_handle_slowpath(struct pt_regs *regs); +static __always_inline bool rseq_v2(struct task_struct *t) +{ + return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1); +} + /* Invoked from resume_user_mode_work() */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { @@ -16,8 +21,7 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) if (current->rseq.event.slowpath) __rseq_handle_slowpath(regs); } else { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + if (current->rseq.event.sched_switch && current->rseq.event.has_rseq) __rseq_handle_slowpath(regs); } } @@ -30,9 +34,9 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs); */ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + if (rseq_v2(current)) { + /* has_rseq is implied in rseq_v2() */ + if
(current->rseq.event.user_irq) __rseq_signal_deliver(ksig->sig, regs); } else { if (current->rseq.event.has_rseq) __rseq_signal_deliver(ksig->sig, regs); @@ -50,15 +54,22 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) { struct rseq_event *ev = &t->rseq.event; - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Only apply the user_irq optimization for RSEQ ABI V2 registrations. + * Legacy users like TCMalloc rely on the original ABI V1 behaviour + * which updates IDs on every context switch. + */ + if (rseq_v2(t)) { /* - * Avoid a boat load of conditionals by using simple logic - * to determine whether NOTIFY_RESUME needs to be raised. + * Avoid a boatload of conditionals by using simple logic to + * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be + * raised. * - * It's required when the CPU or MM CID has changed or - * the entry was from user space. + * It's required when the CPU or MM CID has changed or the entry + * was via interrupt from user space. ev->has_rseq does not have + * to be evaluated here because rseq_v2() implies has_rseq. */ - bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + bool raise = ev->user_irq | ev->ids_changed; if (raise) { ev->sched_switch = true; @@ -66,6 +77,7 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) } } else { if (ev->has_rseq) { + t->rseq.event.ids_changed = true; t->rseq.event.sched_switch = true; rseq_raise_notify_resume(t); } @@ -161,6 +173,7 @@ static inline unsigned int rseq_alloc_align(void) } #else /* CONFIG_RSEQ */ +static inline bool rseq_v2(struct task_struct *t) { return false; } static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index f11ebd34f8b9..934db41ec782 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -111,6 +111,20 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t) t->rseq.slice.state.granted = false; } +/* + * Open coded, so it can be invoked within a user access region. + * + * This clears the user space state of the time slice extensions field only when + * the task has registered the optimized RSEQ ABI V2. Some legacy registrations, + * e.g. TCMalloc, have conflicting non-ABI fields in struct rseq, which would be
+ */ +#define rseq_slice_clear_user(rseq, efault) \ +do { \ + if (rseq_slice_extension_enabled()) \ + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); \ +} while (0) + static __always_inline bool __rseq_grant_slice_extension(bool work_pending) { struct task_struct *curr = current; @@ -230,6 +244,7 @@ static __always_inline bool rseq_slice_extension_enabled(void) { return false; } static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } +#define rseq_slice_clear_user(rseq, efault) do { } while (0) #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); @@ -517,11 +532,9 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, if (csaddr) unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); @@ -612,6 +625,14 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t * interrupts disabled */ guard(pagefault)(); + /* + * This optimization is only valid when the task registered for the + * optimized RSEQ_ABI_V2 variant. Some legacy users rely on the original + * RSEQ implementation behaviour which unconditionally updated the IDs. + * rseq_sched_switch_event() ensures that legacy registrations always + * have both sched_switch and ids_changed set, which is compatible with + * the historical TIF_NOTIFY_RESUME behaviour. + */ if (likely(!t->rseq.event.ids_changed)) { struct rseq __user *rseq = t->rseq.usrptr; /* @@ -623,11 +644,9 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t scoped_user_rw_access(rseq, efault) { unsafe_get_user(csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 0b42045988db..a469c1870849 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -9,6 +9,12 @@ #ifdef CONFIG_RSEQ struct rseq; +/* + * rseq_event::has_rseq contains the ABI version number so preserving it + * in AND operations requires a mask. + */ +#define RSEQ_HAS_RSEQ_VERSION_MASK 0xff + /** * struct rseq_event - Storage for rseq related event management * @all: Compound to initialize and clear the data efficiently @@ -17,7 +23,8 @@ struct rseq; * exit to user * @ids_changed: Indicator that IDs need to be updated * @user_irq: True on interrupt entry from user mode - * @has_rseq: True if the task has a rseq pointer installed + * @has_rseq: Greater than 0 if the task has a rseq pointer installed. 
+ * Contains the RSEQ version number * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME diff --git a/kernel/rseq.c b/kernel/rseq.c index 586f58f652c6..aa25753ea135 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -253,11 +253,14 @@ static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) static void rseq_slowpath_update_usr(struct pt_regs *regs) { /* - * Preserve rseq state and user_irq state. The generic entry code - * clears user_irq on the way out, the non-generic entry - * architectures are not having user_irq. + * Preserve has_rseq and user_irq state. The generic entry code clears + * user_irq on the way out; the non-generic entry architectures do not + * set user_irq. */ - const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; + const struct rseq_event evt_mask = { + .has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK, + .user_irq = true, + }; struct task_struct *t = current; struct rseq_ids ids; u32 node_id; @@ -330,8 +333,9 @@ void __rseq_handle_slowpath(struct pt_regs *regs) void __rseq_signal_deliver(int sig, struct pt_regs *regs) { rseq_stat_inc(rseq_stats.signal); + /* - * Don't update IDs, they are handled on exit to user if + * Don't update IDs yet, they are handled on exit to user if * necessary. The important thing is to abort a critical section of * the interrupted context as after this point the instruction * pointer in @regs points to the signal handler. @@ -344,6 +348,13 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs) current->rseq.event.error = 0; force_sigsegv(sig); } + + /* + * In legacy mode, force the update of IDs before returning to user + * space to stay compatible. + */ + if (!rseq_v2(current)) + rseq_force_update(); } /* @@ -408,6 +419,7 @@ static bool rseq_reset_ids(void) SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { u32 rseqfl = 0; + u8 version = 1; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -461,7 +473,11 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; - if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + /* + * The version check effectively disables time slice extensions until + * RSEQ ABI V2 registrations are implemented. + */ + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) { if (rseq_slice_extension_enabled()) { rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) @@ -484,7 +500,15 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + + /* + * All fields past mm_cid are only valid for non-legacy v2 + * registrations. + */ + if (version > 1) { + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + } } /* @@ -712,6 +736,8 @@ int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) return -ENOTSUPP; if (!current->rseq.usrptr) return -ENXIO; + if (!rseq_v2(current)) + return -ENOTSUPP; /* No change?
*/ if (enable == !!current->rseq.slice.state.enabled) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 623445603725..226a6329f3e9 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -199,7 +199,16 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_sched_switch_event(current); + /* + * Legacy mode requires that IDs are written and the critical section is + * evaluated. V2 optimized mode handles the critical section and IDs are + * only updated if they change as a consequence of preemption after + * return from this IPI. + */ + if (rseq_v2(current)) + rseq_sched_switch_event(current); + else + rseq_force_update(); } static void ipi_sync_rq_state(void *info) From 02b44d943b3adddc3a15c1da97045e205b7d14c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Apr 2026 15:46:06 +0200 Subject: [PATCH 06/13] selftests/rseq: Skip tests if time slice extensions are not available Don't fail; skip the test if the extensions are not enabled at compile time or runtime. Fixes: 830969e7821a ("selftests/rseq: Implement time slice extension test") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.597838491%40kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/slice_test.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c index 357122dcb487..77e668ff74d7 100644 --- a/tools/testing/selftests/rseq/slice_test.c +++ b/tools/testing/selftests/rseq/slice_test.c @@ -124,6 +124,13 @@ FIXTURE_SETUP(slice_ext) { cpu_set_t affinity; + if (rseq_register_current_thread()) + SKIP(return, "RSEQ not supported\n"); + + if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, + PR_RSEQ_SLICE_EXT_ENABLE, 0, 0)) + SKIP(return, "Time slice extension not supported\n"); + ASSERT_EQ(sched_getaffinity(0, sizeof(affinity), &affinity), 0); /* Pin it on a single CPU. Avoid CPU 0 */ @@ -137,11 +144,6 @@ FIXTURE_SETUP(slice_ext) break; } - ASSERT_EQ(rseq_register_current_thread(), 0); - - ASSERT_EQ(prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, - PR_RSEQ_SLICE_EXT_ENABLE, 0, 0), 0); - self->noise_params.noise_nsecs = variant->noise_nsecs; self->noise_params.sleep_nsecs = variant->sleep_nsecs; self->noise_params.run = 1; From d97cb2ef0b221b068e90b6058aa97faa0626bdab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 18:13:54 +0200 Subject: [PATCH 07/13] selftests/rseq: Make registration flexible for legacy and optimized mode rseq_register_current_thread() either uses the glibc registered RSEQ region or registers its own region with the legacy size of 32 bytes. That worked so far, but becomes a problem when the kernel implements a distinction between legacy and performance optimized behavior based on the registration size, as that does not allow testing both modes with the selftest suite. Add two arguments to the function. One to enforce that the registration does not use the libc provided region and one to tell the registration to use the legacy size and not the kernel advertised size. Rename it and make the original one an inline wrapper which preserves the existing behavior.
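A usage sketch of the resulting test-side API under the assumptions of this series (error handling elided; the prototypes come from the selftests' rseq.h):

    #include "rseq.h"	/* selftest header providing the prototypes below */

    static int register_for_mode(bool legacy)
    {
    	/*
    	 * nolibc=true refuses to piggy-back on a glibc-owned registration
    	 * (returns -EBUSY); legacy=true forces the 32-byte v1 size so the
    	 * kernel keeps the historic unconditional-update behaviour.
    	 */
    	return __rseq_register_current_thread(/* nolibc */ true, legacy);
    }

    /* Existing callers keep the old single-call behaviour: */
    static int register_default(void)
    {
    	return rseq_register_current_thread();	/* wrapper for (false, false) */
    }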
Fixes: 566d8015f7ee ("rseq: Avoid CPU/MM CID updates when no event pending") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.677889423%40kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/rseq-abi.h | 7 ++++- tools/testing/selftests/rseq/rseq.c | 39 ++++++++++++------------- tools/testing/selftests/rseq/rseq.h | 8 ++++- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h index ecef315204b2..5f4ea2152c2f 100644 --- a/tools/testing/selftests/rseq/rseq-abi.h +++ b/tools/testing/selftests/rseq/rseq-abi.h @@ -191,10 +191,15 @@ struct rseq_abi { */ struct rseq_abi_slice_ctrl slice_ctrl; + /* + * Placeholder to push the size above 32 bytes. + */ + __u8 __reserved; + /* * Flexible array member at end of structure, after last feature field. */ char end[]; -} __attribute__((aligned(4 * sizeof(__u64)))); +} __attribute__((aligned(256))); #endif /* _RSEQ_ABI_H */ diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index a736727b83c1..be0d0a97031e 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -56,6 +56,7 @@ ptrdiff_t rseq_offset; * unsuccessful. */ unsigned int rseq_size = -1U; +static unsigned int rseq_alloc_size; /* Flags used during rseq registration. */ unsigned int rseq_flags; @@ -115,29 +116,17 @@ bool rseq_available(void) } } -/* The rseq areas need to be at least 32 bytes. */ -static -unsigned int get_rseq_min_alloc_size(void) -{ - unsigned int alloc_size = rseq_size; - - if (alloc_size < ORIG_RSEQ_ALLOC_SIZE) - alloc_size = ORIG_RSEQ_ALLOC_SIZE; - return alloc_size; -} - /* * Return the feature size supported by the kernel. * * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE): * - * 0: Return ORIG_RSEQ_FEATURE_SIZE (20) + * 0: Return ORIG_RSEQ_FEATURE_SIZE (20) * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE). * * It should never return a value below ORIG_RSEQ_FEATURE_SIZE. */ -static -unsigned int get_rseq_kernel_feature_size(void) +static unsigned int get_rseq_kernel_feature_size(void) { unsigned long auxv_rseq_feature_size, auxv_rseq_align; @@ -152,15 +141,24 @@ unsigned int get_rseq_kernel_feature_size(void) return ORIG_RSEQ_FEATURE_SIZE; } -int rseq_register_current_thread(void) +int __rseq_register_current_thread(bool nolibc, bool legacy) { + unsigned int size; int rc; if (!rseq_ownership) { /* Treat libc's ownership as a successful registration. */ - return 0; + return nolibc ? -EBUSY : 0; } - rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG); + + /* The minimal allocation size is 32, which is the legacy allocation size */ + size = get_rseq_kernel_feature_size(); + if (legacy || size < ORIG_RSEQ_ALLOC_SIZE) + rseq_alloc_size = ORIG_RSEQ_ALLOC_SIZE; + else + rseq_alloc_size = size; + + rc = sys_rseq(&__rseq.abi, rseq_alloc_size, 0, RSEQ_SIG); if (rc) { /* * After at least one thread has registered successfully @@ -179,9 +177,8 @@ unsigned int get_rseq_kernel_feature_size(void) * The first thread to register sets the rseq_size to mimic the libc * behavior.
*/ - if (RSEQ_READ_ONCE(rseq_size) == 0) { - RSEQ_WRITE_ONCE(rseq_size, get_rseq_kernel_feature_size()); - } + if (RSEQ_READ_ONCE(rseq_size) == 0) + RSEQ_WRITE_ONCE(rseq_size, size); return 0; } @@ -194,7 +191,7 @@ int rseq_unregister_current_thread(void) /* Treat libc's ownership as a successful unregistration. */ return 0; } - rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + rc = sys_rseq(&__rseq.abi, rseq_alloc_size, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); if (rc) return -1; return 0; diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index f51a5fdb0444..c62ebb9290c0 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -8,6 +8,7 @@ #ifndef RSEQ_H #define RSEQ_H +#include <stdbool.h> #include <stdint.h> #include <stddef.h> #include <assert.h> @@ -142,7 +143,12 @@ static inline struct rseq_abi *rseq_get_abi(void) * succeed. A restartable sequence executed from a non-registered * thread will always fail. */ -int rseq_register_current_thread(void); +int __rseq_register_current_thread(bool nolibc, bool legacy); + +static inline int rseq_register_current_thread(void) +{ + return __rseq_register_current_thread(false, false); +} /* * Unregister rseq for current thread. From fdf4eb632683bfc2840acebe62716cb468d43e10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 17:51:07 +0200 Subject: [PATCH 08/13] selftests/rseq: Validate legacy behavior The RSEQ legacy mode behavior requires that the ID fields in the rseq region are unconditionally updated on every context switch and before signal delivery even if not required by the ABI specification. To ensure that this behavior is preserved for legacy users in the future, add a test which validates that with a sleep() and a signal sent to self. Provide a run script which prevents GLIBC from registering a RSEQ region, so that the test can register its own legacy-sized region.
Fixes: 566d8015f7ee ("rseq: Avoid CPU/MM CID updates when no event pending") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.764705536%40kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/Makefile | 5 +- tools/testing/selftests/rseq/legacy_check.c | 65 +++++++++++++++++++ .../selftests/rseq/run_legacy_check.sh | 4 ++ 3 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/rseq/legacy_check.c create mode 100755 tools/testing/selftests/rseq/run_legacy_check.sh diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 0d1947c0d623..0293a2f17f51 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -22,9 +22,10 @@ TEST_GEN_PROGS_EXTENDED = librseq.so \ param_test_compare_twice \ param_test_mm_cid \ param_test_mm_cid_compare_twice \ - syscall_errors_test + syscall_errors_test \ + legacy_check -TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh +TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh run_legacy_check.sh TEST_FILES := settings diff --git a/tools/testing/selftests/rseq/legacy_check.c b/tools/testing/selftests/rseq/legacy_check.c new file mode 100644 index 000000000000..3f7de4e28303 --- /dev/null +++ b/tools/testing/selftests/rseq/legacy_check.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <unistd.h> + +#include "rseq.h" + +#include "../kselftest_harness.h" + +FIXTURE(legacy) +{ +}; + +static int cpu_id_in_sigfn = -1; + +static void sigfn(int sig) +{ + struct rseq_abi *rs = rseq_get_abi(); + + cpu_id_in_sigfn = rs->cpu_id_start; +} + +FIXTURE_SETUP(legacy) +{ + int res = __rseq_register_current_thread(true, true); + + switch (res) { + case -ENOSYS: + SKIP(return, "RSEQ not enabled\n"); + case -EBUSY: + SKIP(return, "GLIBC owns RSEQ. Disable GLIBC RSEQ registration\n"); + default: + ASSERT_EQ(res, 0); + } + + ASSERT_NE(signal(SIGUSR1, sigfn), SIG_ERR); +} + +FIXTURE_TEARDOWN(legacy) +{ +} + +TEST_F(legacy, legacy_test) +{ + struct rseq_abi *rs = rseq_get_abi(); + + ASSERT_NE(rs, NULL); + + /* Overwrite rs::cpu_id_start */ + rs->cpu_id_start = -1; + sleep(1); + ASSERT_NE(rs->cpu_id_start, -1); + + rs->cpu_id_start = -1; + ASSERT_EQ(raise(SIGUSR1), 0); + ASSERT_NE(rs->cpu_id_start, -1); + ASSERT_NE(cpu_id_in_sigfn, -1); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/rseq/run_legacy_check.sh b/tools/testing/selftests/rseq/run_legacy_check.sh new file mode 100755 index 000000000000..5577b46ea092 --- /dev/null +++ b/tools/testing/selftests/rseq/run_legacy_check.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" ./legacy_check From 82f572449cfe75f12ea985986da60e11f308f77d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 16:21:02 +0200 Subject: [PATCH 09/13] rseq: Implement read only ABI enforcement for optimized RSEQ V2 mode The optimized RSEQ V2 mode requires that user space adheres to the ABI specification and does not modify the read-only fields cpu_id_start, cpu_id, node_id and mm_cid behind the kernel's back. While the kernel does not rely on these fields, the adherence to this is a fundamental prerequisite to allow multiple entities, e.g.
libraries, in an application to utilize the full potential of RSEQ without stepping on each other's toes. Validate this adherence on every update of these fields. If the kernel detects that user space modified the fields, the application is forcibly terminated. Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.845230956%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq_entry.h | 83 ++++++++++++++------------------------ include/linux/rseq_types.h | 4 +- kernel/rseq.c | 5 +-- 3 files changed, 35 insertions(+), 57 deletions(-) diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 934db41ec782..2d0295df5107 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -248,7 +248,6 @@ static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, un #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); -bool rseq_debug_validate_ids(struct task_struct *t); static __always_inline void rseq_note_user_irq_entry(void) { @@ -368,43 +367,6 @@ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, return false; } -/* - * On debug kernels validate that user space did not mess with it if the - * debug branch is enabled. - */ -bool rseq_debug_validate_ids(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq.usrptr; - u32 cpu_id, uval, node_id; - - /* - * On the first exit after registering the rseq region CPU ID is - * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! - */ - node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? - cpu_to_node(t->rseq.ids.cpu_id) : 0; - - scoped_user_read_access(rseq, efault) { - unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); - if (cpu_id != t->rseq.ids.cpu_id) - goto die; - unsafe_get_user(uval, &rseq->cpu_id, efault); - if (uval != cpu_id) - goto die; - unsafe_get_user(uval, &rseq->node_id, efault); - if (uval != node_id) - goto die; - unsafe_get_user(uval, &rseq->mm_cid, efault); - if (uval != t->rseq.ids.mm_cid) - goto die; - } - return true; -die: - t->rseq.event.fatal = true; -efault: - return false; -} - #endif /* RSEQ_BUILD_SLOW_PATH */ /* @@ -514,20 +476,32 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c * faults in task context are fatal too.
*/ static rseq_inline -bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, - u32 node_id, u64 *csaddr) +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, u64 *csaddr) { struct rseq __user *rseq = t->rseq.usrptr; - if (static_branch_unlikely(&rseq_debug_enabled)) { - if (!rseq_debug_validate_ids(t)) - return false; - } - scoped_user_rw_access(rseq, efault) { + /* Validate the R/O fields for debug and optimized mode */ + if (static_branch_unlikely(&rseq_debug_enabled) || rseq_v2(t)) { + u32 cpu_id, uval; + + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != t->rseq.ids.node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); - unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->node_id, &rseq->node_id, efault); unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); if (csaddr) unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); @@ -539,10 +513,13 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, rseq_slice_clear_grant(t); /* Cache the new values */ - t->rseq.ids.cpu_cid = ids->cpu_cid; + t->rseq.ids = *ids; rseq_stat_inc(rseq_stats.ids); rseq_trace_update(t, ids); return true; + +die: + t->rseq.event.fatal = true; efault: return false; } @@ -552,11 +529,11 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, * is in a critical section. */ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, - struct rseq_ids *ids, u32 node_id) + struct rseq_ids *ids) { u64 csaddr; - if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + if (!rseq_set_ids_get_csaddr(t, ids, &csaddr)) return false; /* @@ -659,12 +636,12 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t } struct rseq_ids ids = { - .cpu_id = task_cpu(t), - .mm_cid = task_mm_cid(t), + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + .node_id = cpu_to_node(task_cpu(t)), }; - u32 node_id = cpu_to_node(ids.cpu_id); - return rseq_update_usr(t, regs, &ids, node_id); + return rseq_update_usr(t, regs, &ids); efault: return false; } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index a469c1870849..85739a63e85e 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -66,8 +66,9 @@ struct rseq_event { * compiler emit a single compare on 64-bit * @cpu_id: The CPU ID which was written last to user space * @mm_cid: The MM CID which was written last to user space + * @node_id: The node ID which was written last to user space * - * @cpu_id and @mm_cid are updated when the data is written to user space. + * @cpu_id, @mm_cid and @node_id are updated when the data is written to user space.
*/ struct rseq_ids { union { @@ -77,6 +78,7 @@ struct rseq_ids { u32 mm_cid; }; }; + u32 node_id; }; /** diff --git a/kernel/rseq.c b/kernel/rseq.c index aa25753ea135..101612027f6a 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -263,7 +263,6 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs) }; struct task_struct *t = current; struct rseq_ids ids; - u32 node_id; bool event; if (unlikely(t->flags & PF_EXITING)) @@ -299,9 +298,9 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs) if (!event) return; - node_id = cpu_to_node(ids.cpu_id); + ids.node_id = cpu_to_node(ids.cpu_id); - if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + if (unlikely(!rseq_update_usr(t, regs, &ids))) { /* * Clear the errors just in case this might survive magically, but * leave the rest intact. From 99428157dcf32fdac97355aa1cc1364dbc9e073c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 10:01:56 +0200 Subject: [PATCH 10/13] rseq: Reenable performance optimizations conditionally Due to the incompatibility with TCMalloc the RSEQ optimizations and extended features (time slice extensions) have been disabled and made run-time conditional. The original RSEQ implementation, which TCMalloc depends on, registers a 32 byte region (ORIG_RSEQ_SIZE). This region has a 32 byte alignment requirement. The extension safe newer variant exposes the kernel RSEQ feature size via getauxval(AT_RSEQ_FEATURE_SIZE) and the alignment requirement via getauxval(AT_RSEQ_ALIGN). The alignment requirement is that the registered RSEQ region is aligned to the next power of two of the feature size. The kernel currently has a feature size of 33 bytes, which means the alignment requirement is 64 bytes. The TCMalloc RSEQ region is embedded into a cache line aligned data structure starting at offset 32 bytes so that bytes 28-31 and the cpu_id_start field at bytes 32-35 form a 64-bit little endian pointer with the top-most bit (bit 63) set to check whether the kernel has overwritten cpu_id_start with an actual CPU id value, which is guaranteed to not have the top-most bit set. As this is part of their performance tuned magic, it's a pretty safe assumption that TCMalloc won't use a larger RSEQ size. This allows the kernel to declare registrations with a size greater than the original size of 32 bytes, which is the case since time slice extensions got introduced, as RSEQ ABI v2 with the following differences from the original behaviour: 1) Unconditional updates of the user read only fields (CPU, node, MMCID) are removed. Those fields are only updated on registration, task migration and MMCID changes. 2) Unconditional evaluation of the critical section pointer is removed. It's only evaluated when user space was interrupted and was scheduled out or before delivering a signal in the interrupted context. 3) The read-only requirement of the ID fields is enforced. When the kernel detects that userspace manipulated the fields, the process is terminated. This ensures that multiple entities (libraries) can utilize RSEQ without interfering. 4) Today's extended RSEQ feature (time slice extensions) and future extensions are only enabled in the v2 mode. Registrations with the original size of 32 bytes operate in backwards compatible legacy mode without performance improvements and extended features. Unfortunately that also affects users of older GLIBC versions which register the original size of 32 bytes and do not evaluate the kernel required size in the auxiliary vector AT_RSEQ_FEATURE_SIZE.
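A registration-side sketch of how a cooperative user is expected to size and align the area so the kernel selects v2 mode. This is hedged: it mirrors what the selftest code in this series does, assumes headers that define AT_RSEQ_FEATURE_SIZE/AT_RSEQ_ALIGN, assumes glibc has not already registered (e.g. via the glibc.pthread.rseq=0 tunable), and uses the selftests' signature as a placeholder:

    #include <linux/rseq.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/auxv.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define RSEQ_SIG	0x53053053	/* placeholder signature */

    static struct rseq *alloc_and_register_rseq(void)
    {
    	unsigned long size  = getauxval(AT_RSEQ_FEATURE_SIZE);
    	unsigned long align = getauxval(AT_RSEQ_ALIGN);
    	struct rseq *rs;

    	/* Old kernels return 0; sizes below 32 mean the original ABI */
    	if (size < 32)
    		size = 32;
    	if (align < 32)
    		align = 32;

    	/* aligned_alloc() wants the size rounded up to the alignment */
    	rs = aligned_alloc(align, (size + align - 1) & ~(align - 1));
    	if (!rs)
    		return NULL;
    	memset(rs, 0, size);

    	/* Registering more than 32 bytes opts into ABI v2 on this kernel */
    	if (syscall(__NR_rseq, rs, size, 0, RSEQ_SIG)) {
    		free(rs);
    		return NULL;
    	}
    	return rs;
    }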
That's the result of the lack of enforcement in the original implementation and the unwillingness of a single entity to cooperate with the larger ecosystem for many years. Implement the required registration changes by restructuring the spaghetti code and adding the size/version check. Also add documentation about the differences between legacy and optimized RSEQ V2 mode. Thanks to Mathieu for pointing out the ORIG_RSEQ_SIZE constraints! Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.927160119%40kernel.org Cc: stable@vger.kernel.org --- Documentation/userspace-api/rseq.rst | 94 ++++++++++++++++- kernel/rseq.c | 144 ++++++++++++++++----------- 2 files changed, 178 insertions(+), 60 deletions(-) diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst index 3cd27a3c7c7e..8549a6c61531 100644 --- a/Documentation/userspace-api/rseq.rst +++ b/Documentation/userspace-api/rseq.rst @@ -24,6 +24,97 @@ Quick access to CPU number, node ID Allows to implement per CPU data efficiently. Documentation is in code and selftests. :( +Optimized RSEQ V2 +----------------- + +On architectures which utilize the generic entry code and generic TIF bits +the kernel supports runtime optimizations for RSEQ, which also enable +enhanced features like scheduler time slice extensions. + +To enable them, a task has to register the RSEQ region with at least the +length advertised by getauxval(AT_RSEQ_FEATURE_SIZE). + +If existing binaries register with ORIG_RSEQ_SIZE (32 bytes), the kernel +keeps the legacy low-performance mode enabled to fulfil the expectations +of existing users regarding the original RSEQ implementation behaviour. + +The following table documents the ABI and behavioral guarantees of the +legacy and the optimized V2 mode. + +.. list-table:: RSEQ modes + :header-rows: 1 + + * - Nr + - What + + - Legacy + - Optimized V2 + + * - 1 + - The cpu_id_start, cpu_id, node_id and mm_cid fields (User mode read + only) + .. Legacy + - Updated by the kernel unconditionally after each context switch and + before signal delivery + .. Optimized V2 + - Updated by the kernel if and only if they change, i.e. if the task + is migrated or mm_cid changes + + * - 2 + - The rseq_cs critical section field + .. Legacy + - Evaluated and handled unconditionally after each context switch and + before signal delivery + .. Optimized V2 + - Evaluated and handled conditionally only when user space was + interrupted and was scheduled out or before delivering a signal in + the interrupted context. + + * - 3 + - Read only fields + .. Legacy + - No strict enforcement except in debug mode + .. Optimized V2 + - Strict enforcement + + * - 4 + - membarrier(...RSEQ) + .. Legacy + - All running threads of the process are interrupted and the ID fields + are rewritten and any active critical sections are aborted + before they return to user space. All threads which are scheduled + out whether voluntary or not are covered by #1/#2 above. + .. Optimized V2 + - All running threads of the process are interrupted and any + active critical sections are aborted before these threads return to + user space. The ID fields are only updated if changed as a + consequence of the interrupt. All threads which are scheduled out + whether voluntary or not are covered by #1/#2 above. + + * - 5 + - Time slice extensions + ..
Legacy + - Not supported + .. Optimized V2 + - Supported + +The legacy mode is obviously less performant as it does unconditional +updates and critical section checks even if not strictly required by the +ABI contract. That can't be changed anymore as some users depend on that +observed behavior, which in turn enables them to violate the ABI and +overwrite the cpu_id_start field for their own purposes. This is obviously +discouraged as it renders RSEQ incompatible with the intended usage and +breaks the expectation of other libraries in the same application. + +The ABI compliant optimized v2 mode, which respects the read only fields, +does not require unconditional updates and therefore is way more +performant. The kernel validates the read only fields for compliance. If +user space modifies them, the process is killed. Compliant usage allows +multiple libraries in the same application to benefit from the RSEQ +functionality without disturbing each other. The ABI compliant optimized v2 +mode also enables extended RSEQ features like time slice extensions. + + Scheduler time slice extensions ------------------------------- @@ -37,7 +128,8 @@ The prerequisites for this functionality are: * Enabled at boot time (default is enabled) - * A rseq userspace pointer has been registered for the thread + * A rseq userspace pointer has been registered for the thread in + optimized V2 mode The thread has to enable the functionality via prctl(2):: diff --git a/kernel/rseq.c b/kernel/rseq.c index 101612027f6a..e75e3a5e312c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -412,70 +412,23 @@ static bool rseq_reset_ids(void) /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 -/* - * sys_rseq - setup restartable sequences for caller thread. - */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) { u32 rseqfl = 0; u8 version = 1; - if (flags & RSEQ_FLAG_UNREGISTER) { - if (flags & ~RSEQ_FLAG_UNREGISTER) - return -EINVAL; - /* Unregister rseq for current thread. */ - if (current->rseq.usrptr != rseq || !current->rseq.usrptr) - return -EINVAL; - if (rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - if (!rseq_reset_ids()) - return -EFAULT; - rseq_reset(current); - return 0; - } - - if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) - return -EINVAL; - - if (current->rseq.usrptr) { - /* - * If rseq is already registered, check whether - * the provided address differs from the prior - * one. - */ - if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - /* Already registered. */ - return -EBUSY; - } - - /* - * If there was no rseq previously registered, ensure the provided rseq - * is properly aligned, as communcated to user-space through the ELF - * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq - * size, the required alignment is the original struct rseq alignment. - * - * The rseq_len is required to be greater or equal to the original rseq - * size. In order to be valid, rseq_len is either the original rseq size, - * or large enough to contain all supported fields, as communicated to - * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. 
- */ - if (rseq_len < ORIG_RSEQ_SIZE || - (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || - rseq_len < offsetof(struct rseq, end)))) - return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; /* - * The version check effectively disables time slice extensions until - * RSEQ ABI V2 registrations are implemented. + * Architectures which use the generic IRQ entry code enable registrations + * with a size greater than the original fixed-size v1 @rseq_len (already + * validated at this point) to utilize the optimized v2 ABI mode, which + * also enables extended RSEQ features beyond MM CID. */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE) + version = 2; + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) { if (rseq_slice_extension_enabled()) { rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; @@ -523,11 +476,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 #endif /* - * If rseq was previously inactive, and has just been - * registered, ensure the cpu_id_start and cpu_id fields - * are updated before returning to user-space. + * Ensure the cpu_id_start and cpu_id fields are updated before + * returning to user-space. */ - current->rseq.event.has_rseq = true; + current->rseq.event.has_rseq = version; rseq_force_update(); return 0; @@ -535,6 +487,80 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return -EFAULT; } +static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) +{ + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) + return -EINVAL; + if (rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); + return 0; +} + +static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig) +{ + /* + * If rseq is already registered, check whether the provided address + * differs from the prior one. + */ + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; +} + +static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len) +{ + /* + * Ensure the provided rseq is properly aligned, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If + * rseq_len is the original rseq size, the required alignment is the + * original struct rseq alignment. + * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. + */ + if (rseq_len < ORIG_RSEQ_SIZE) + return false; + + if (rseq_len == ORIG_RSEQ_SIZE) + return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE); + + return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) && + rseq_len >= offsetof(struct rseq, end); +} + +#define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + +/* + * sys_rseq - Register or unregister restartable sequences for the caller thread.
+ */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +{ + if (flags & RSEQ_FLAG_UNREGISTER) + return rseq_unregister(rseq, rseq_len, flags, sig); + + if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED)) + return -EINVAL; + + if (current->rseq.usrptr) + return rseq_reregister(rseq, rseq_len, sig); + + if (!rseq_length_valid(rseq, rseq_len)) + return -EINVAL; + + return rseq_register(rseq, rseq_len, flags, sig); +} + #ifdef CONFIG_RSEQ_SLICE_EXTENSION struct slice_timer { struct hrtimer timer; From e744060076871eebc2647b24420b550ff44b2b65 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Apr 2026 14:48:23 +0200 Subject: [PATCH 11/13] selftests/rseq: Expand for optimized RSEQ ABI v2 Update the selftests so they are executed for legacy (32-byte RSEQ region) and optimized RSEQ ABI v2 modes. Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224428.009121296%40kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/Makefile | 11 ++++-- .../testing/selftests/rseq/check_optimized.c | 17 ++++++++ tools/testing/selftests/rseq/param_test.c | 25 +++++++----- .../testing/selftests/rseq/run_param_test.sh | 39 +++++++++++++++++++ .../selftests/rseq/run_timeslice_test.sh | 14 +++++++ tools/testing/selftests/rseq/slice_test.c | 2 +- 6 files changed, 95 insertions(+), 13 deletions(-) create mode 100644 tools/testing/selftests/rseq/check_optimized.c create mode 100755 tools/testing/selftests/rseq/run_timeslice_test.sh diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 0293a2f17f51..50d69e22ee7a 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -15,7 +15,7 @@ LDLIBS += -lpthread -ldl OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test \ - param_test_benchmark param_test_mm_cid_benchmark slice_test + param_test_benchmark param_test_mm_cid_benchmark TEST_GEN_PROGS_EXTENDED = librseq.so \ param_test \ @@ -23,9 +23,11 @@ TEST_GEN_PROGS_EXTENDED = librseq.so \ param_test_mm_cid \ param_test_mm_cid_compare_twice \ syscall_errors_test \ - legacy_check + legacy_check \ + slice_test \ + check_optimized -TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh run_legacy_check.sh +TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh run_legacy_check.sh run_timeslice_test.sh TEST_FILES := settings @@ -66,3 +68,6 @@ $(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) $(OUTPUT)/slice_test: slice_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ + +$(OUTPUT)/check_optimized: check_optimized.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h + $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ diff --git a/tools/testing/selftests/rseq/check_optimized.c b/tools/testing/selftests/rseq/check_optimized.c new file mode 100644 index 000000000000..a13e3f2c8fc6 --- /dev/null +++ b/tools/testing/selftests/rseq/check_optimized.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: LGPL-2.1 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "rseq.h" + +int main(int argc, char **argv) +{ + if (__rseq_register_current_thread(true, false)) + return -1; + return 0; +} diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index 05d03e679e06..e1e98dbabe4b 100644
--- a/tools/testing/selftests/rseq/param_test.c
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -38,7 +38,7 @@ static int opt_modulo, verbose;
 static int opt_yield, opt_signal, opt_sleep,
 		opt_disable_rseq, opt_threads = 200,
 		opt_disable_mod = 0, opt_test = 's';
-
+static bool opt_rseq_legacy;
 static long long opt_reps = 5000;
 
 static __thread __attribute__((tls_model("initial-exec")))
@@ -281,9 +281,12 @@ unsigned int yield_mod_cnt, nr_abort;
 	} \
 }
 
+#define rseq_no_glibc true
+
 #else
 
 #define printf_verbose(fmt, ...)
 
+#define rseq_no_glibc false
 
 #endif /* BENCHMARK */
@@ -481,7 +484,7 @@ void *test_percpu_spinlock_thread(void *arg)
 	long long i, reps;
 
 	if (!opt_disable_rseq && thread_data->reg &&
-	    rseq_register_current_thread())
+	    __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy))
 		abort();
 	reps = thread_data->reps;
 	for (i = 0; i < reps; i++) {
@@ -558,7 +561,7 @@ void *test_percpu_inc_thread(void *arg)
 	long long i, reps;
 
 	if (!opt_disable_rseq && thread_data->reg &&
-	    rseq_register_current_thread())
+	    __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy))
 		abort();
 	reps = thread_data->reps;
 	for (i = 0; i < reps; i++) {
@@ -712,7 +715,7 @@ void *test_percpu_list_thread(void *arg)
 	long long i, reps;
 	struct percpu_list *list = (struct percpu_list *)arg;
 
-	if (!opt_disable_rseq && rseq_register_current_thread())
+	if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy))
 		abort();
 
 	reps = opt_reps;
@@ -895,7 +898,7 @@ void *test_percpu_buffer_thread(void *arg)
 	long long i, reps;
 	struct percpu_buffer *buffer = (struct percpu_buffer *)arg;
 
-	if (!opt_disable_rseq && rseq_register_current_thread())
+	if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy))
 		abort();
 
 	reps = opt_reps;
@@ -1105,7 +1108,7 @@ void *test_percpu_memcpy_buffer_thread(void *arg)
 	long long i, reps;
 	struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg;
 
-	if (!opt_disable_rseq && rseq_register_current_thread())
+	if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy))
 		abort();
 
 	reps = opt_reps;
@@ -1258,7 +1261,7 @@ void *test_membarrier_worker_thread(void *arg)
 	const int iters = opt_reps;
 	int i;
 
-	if (rseq_register_current_thread()) {
+	if (__rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) {
 		fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
 			errno, strerror(errno));
 		abort();
@@ -1323,7 +1326,7 @@ void *test_membarrier_manager_thread(void *arg)
 	intptr_t expect_a = 0, expect_b = 0;
 	int cpu_a = 0, cpu_b = 0;
 
-	if (rseq_register_current_thread()) {
+	if (__rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) {
 		fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
 			errno, strerror(errno));
 		abort();
@@ -1475,6 +1478,7 @@ static void show_usage(int argc, char **argv)
 	printf("	[-D M] Disable rseq for each M threads\n");
 	printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n");
 	printf("	[-M] Push into buffer and memcpy buffer with memory barriers.\n");
+	printf("	[-O] Test with optimized RSEQ\n");
 	printf("	[-v] Verbose output.\n");
 	printf("	[-h] Show this help.\n");
 	printf("\n");
@@ -1602,6 +1606,9 @@ int main(int argc, char **argv)
 		case 'M':
 			opt_mo = RSEQ_MO_RELEASE;
 			break;
+		case 'L':
+			opt_rseq_legacy = true;
+			break;
 		default:
 			show_usage(argc, argv);
 			goto error;
@@ -1618,7 +1625,7 @@ int main(int argc, char **argv)
 	if (set_signal_handler())
 		goto error;
 
-	if (!opt_disable_rseq && rseq_register_current_thread())
+	if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy))
 		goto error;
 	if (!opt_disable_rseq && !rseq_validate_cpu_id()) {
 		fprintf(stderr, "Error: cpu id getter unavailable\n");
diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh
index 8d31426ab41f..69a3fa049929 100755
--- a/tools/testing/selftests/rseq/run_param_test.sh
+++ b/tools/testing/selftests/rseq/run_param_test.sh
@@ -34,6 +34,11 @@ REPS=1000
 SLOW_REPS=100
 NR_THREADS=$((6*${NR_CPUS}))
 
+# Prevent GLIBC from registering RSEQ so the selftest can run in legacy and
+# performance optimized mode.
+GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0"
+export GLIBC_TUNABLES
+
 function do_tests()
 {
 	local i=0
@@ -103,6 +108,40 @@ function inject_blocking()
 	NR_LOOPS=
 }
 
+echo "Testing in legacy RSEQ mode"
+echo "Yield injection (25%)"
+inject_blocking -m 4 -y -L
+
+echo "Yield injection (50%)"
+inject_blocking -m 2 -y -L
+
+echo "Yield injection (100%)"
+inject_blocking -m 1 -y -L
+
+echo "Kill injection (25%)"
+inject_blocking -m 4 -k -L
+
+echo "Kill injection (50%)"
+inject_blocking -m 2 -k -L
+
+echo "Kill injection (100%)"
+inject_blocking -m 1 -k -L
+
+echo "Sleep injection (1ms, 25%)"
+inject_blocking -m 4 -s 1 -L
+
+echo "Sleep injection (1ms, 50%)"
+inject_blocking -m 2 -s 1 -L
+
+echo "Sleep injection (1ms, 100%)"
+inject_blocking -m 1 -s 1 -L
+
+./check_optimized || {
+	echo "Skipping optimized RSEQ mode test. Not supported";
+	exit 0
+}
+
+echo "Testing in optimized RSEQ mode"
 echo "Yield injection (25%)"
 inject_blocking -m 4 -y
 
diff --git a/tools/testing/selftests/rseq/run_timeslice_test.sh b/tools/testing/selftests/rseq/run_timeslice_test.sh
new file mode 100755
index 000000000000..551ebed71ec6
--- /dev/null
+++ b/tools/testing/selftests/rseq/run_timeslice_test.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+
+# Prevent GLIBC from registering RSEQ so the selftest can run in legacy
+# and performance optimized mode.
+GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0"
+export GLIBC_TUNABLES
+
+./check_optimized || {
+	echo "Skipping optimized RSEQ mode test. Not supported";
+	exit 0
+}
+
+./slice_test
diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c
index 77e668ff74d7..e402d4440bc2 100644
--- a/tools/testing/selftests/rseq/slice_test.c
+++ b/tools/testing/selftests/rseq/slice_test.c
@@ -124,7 +124,7 @@ FIXTURE_SETUP(slice_ext)
 {
 	cpu_set_t affinity;
 
-	if (rseq_register_current_thread())
+	if (__rseq_register_current_thread(true, false))
 		SKIP(return, "RSEQ not supported\n");
 
 	if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,

From b6eee96843e8d088200f01b035da98e72067c5fe Mon Sep 17 00:00:00 2001
From: Zhan Xusheng
Date: Fri, 1 May 2026 12:40:06 +0200
Subject: [PATCH 12/13] sched/fair: Fix overflow in vruntime_eligible()

Zhan Xusheng reported running into a sporadic s64 multiplication overflow
in vruntime_eligible(). Constructing a worst case scenario:

If you have cgroups, then you can have an entity of weight 2 (per
calc_group_shares()), and its vlag should then be bounded by:
(slice + TICK_NSEC) * NICE_0_LOAD, which is around 44 bits as per the
comment on entity_key(). The other extreme is 100*NICE_0_LOAD, thus
you get:

  {key, weight}[] := {
	puny: { (slice + TICK_NSEC) * NICE_0_LOAD, 2 },
	max:  { 0, 100*NICE_0_LOAD },
  }

The avg_vruntime() would end up very close to 0 (which is zero_vruntime),
so making that more accurate is no real help.

vruntime_eligible(puny) ends up with:

  avg  = 2 * puny.key (+ 0)
  load = 2 + 100 * NICE_0_LOAD

  avg >= puny.key * load

And that is: (slice + TICK_NSEC) * NICE_0_LOAD * NICE_0_LOAD * 100,
which will overflow s64.

Zhan suggested using __builtin_mul_overflow(); however, after staring at
compiler output for various architectures using godbolt, it seems that
using an __int128 multiplication often results in better code.
Specifically, a number of architectures already compute the __int128
product to determine the overflow. E.g. arm64 already uses the 'smulh'
instruction. By explicitly doing an __int128 multiply, it will emit the
'mul; smulh' pattern, which modern cores can fuse (armv8-a clang-22.1.0).
x86_64 ends up with fewer branches (no OF handling).

Since Linux has ARCH_SUPPORTS_INT128 to gate __int128 usage, also provide
the __builtin_mul_overflow() variant as a fallback.

[peterz: Changelog and __int128 bits]

Fixes: 556146ce5e94 ("sched/fair: Avoid overflow in enqueue_entity()")
Reported-by: Zhan Xusheng
Closes: https://patch.msgid.link/20260415145742.10359-1-zhanxusheng%40xiaomi.com
Signed-off-by: Zhan Xusheng
Signed-off-by: Peter Zijlstra (Intel)
Link: https://patch.msgid.link/20260505103155.GN3102924%40noisy.programming.kicks-ass.net
---
 kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)
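Plugging representative numbers into that construction shows how far past
64 bits the product lands. A back-of-the-envelope sketch, compilable with
GCC/Clang on a 64-bit host; the HZ=100 tick and ~3ms slice are assumed
stand-ins rather than values from any particular config, and NICE_0_LOAD
mirrors the 64-bit kernel value:

  #include <stdio.h>

  int main(void)
  {
  	const long long NICE_0_LOAD = 1LL << 20;	/* 64-bit kernels */
  	const long long TICK_NSEC   = 10000000;		/* HZ=100: 10ms tick */
  	const long long slice       = 3000000;		/* ~3ms base slice */

  	/* vlag bound of the weight-2 entity: ~2^44 */
  	__int128 key  = (slice + TICK_NSEC) * NICE_0_LOAD;
  	/* total load with the 100*NICE_0_LOAD entity queued: ~2^27 */
  	__int128 load = 2 + 100 * NICE_0_LOAD;
  	__int128 prod = key * load;

  	int bits = 0;
  	for (__int128 v = prod; v; v >>= 1)
  		bits++;

  	/* Prints: key*load needs 71 bits, s64 holds 63 */
  	printf("key*load needs %d bits, s64 holds 63\n", bits);
  	return 0;
  }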
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 728965851842..b91c8b294229 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -882,11 +882,11 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
  *
  * lag_i >= 0 -> V >= v_i
  *
- *      \Sum (v_i - v)*w_i
- * V = ------------------ + v
+ *      \Sum (v_i - v0)*w_i
+ * V = ------------------- + v0
  *          \Sum w_i
  *
- * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ * lag_i >= 0 -> \Sum (v_i - v0)*w_i >= (v_i - v0)*(\Sum w_i)
  *
  * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
  * to the loss in precision caused by the division.
@@ -894,7 +894,7 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	struct sched_entity *curr = cfs_rq->curr;
-	s64 avg = cfs_rq->sum_w_vruntime;
+	s64 key, avg = cfs_rq->sum_w_vruntime;
 	long load = cfs_rq->sum_weight;
 
 	if (curr && curr->on_rq) {
@@ -904,7 +904,36 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 		load += weight;
 	}
 
-	return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load;
+	key = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
+
+	/*
+	 * The worst case term for @key includes 'TICK_NSEC * NICE_0_LOAD'
+	 * and @load obviously includes NICE_0_LOAD. TICK_NSEC is around 24
+	 * bits, while NICE_0_LOAD is 20 on 64bit and 10 otherwise.
+	 *
+	 * This gives that on 64bit the product will be at least 64bit which
+	 * overflows s64, while on 32bit it will only be 44bits and should fit
+	 * comfortably.
+	 */
+#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+	/* This often results in simpler code than __builtin_mul_overflow(). */
+	return avg >= (__int128)key * load;
+#else
+	s64 rhs;
+	/*
+	 * On overflow, the sign of key tells us the correct answer: a large
+	 * positive key means vruntime >> V, so not eligible; a large negative
+	 * key means vruntime << V, so eligible.
+	 */
+	if (check_mul_overflow(key, load, &rhs))
+		return key <= 0;
+
+	return avg >= rhs;
+#endif
+#else /* 32bit */
+	return avg >= key * load;
+#endif
 }
 
 int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)

From 9f6d929ee2c6f0266edb564bcd2bd47fd6e884a8 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Sun, 3 May 2026 12:45:03 +0200
Subject: [PATCH 13/13] sched/fair: Fix wakeup_preempt_fair() for not waking up task

Make sure to only call pick_next_entity() on a non-empty cfs_rq. The
assumption that p is always enqueued and not delayed is only true for
wakeups. If p was moved while delayed, pick_next_entity() will dequeue
it and the cfs_rq might become empty. Test whether there are still
queued tasks before trying again to determine if p could be the next
one to be picked.

There are at least 2 cases:

When a CPU becomes idle, it tries to pull tasks, but if the pulled
tasks are delayed, they will be dequeued when attached to the cfs_rq:

  attach_tasks()
    -> attach_task()
      -> wakeup_preempt(rq, p, 0);

A misfit task running on CPU A triggers a load balance so it can be
pulled to a better CPU; the load balance on CPU B starts an active load
balance to pull the running misfit task. If there is a delayed-dequeue
task on CPU A, it can be pulled instead of the previously running misfit
task:

  attach_one_task()
    -> attach_task()
      -> wakeup_preempt(rq, p, 0);

Fixes: ac8e69e69363 ("sched/fair: Fix wakeup_preempt_fair() vs delayed dequeue")
Signed-off-by: Vincent Guittot
Signed-off-by: Peter Zijlstra (Intel)
Link: https://patch.msgid.link/20260503104503.1732682-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b91c8b294229..3ebec186f982 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9174,9 +9174,10 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
 
 	/*
 	 * Because p is enqueued, nse being null can only mean that we
-	 * dequeued a delayed task.
+	 * dequeued a delayed task. If there are still entities queued in
+	 * cfs, check if the next one will be p.
 	 */
-	if (!nse)
+	if (!nse && cfs_rq->nr_queued)
 		goto pick;
 
 	if (sched_feat(RUN_TO_PARITY))