Files
linux/include/linux/rseq.h
Thomas Gleixner b9eac6a9d9 rseq: Revert to historical performance killing behaviour
The recent RSEQ optimization work broke the TCMalloc abuse of the RSEQ ABI
as it no longer unconditionally updates the CPU, node, mm_cid fields,
which are documented as read only for user space. Due to the observed
behavior of the kernel it was possible for TCMalloc to overwrite the
cpu_id_start field for their own purposes and rely on the kernel to update
it unconditionally after each context switch and before signal delivery.

The RSEQ ABI only guarantees that these fields are updated when the data
changes, i.e. the task is migrated or the MMCID of the task changes due to
switching from or to per CPU ownership mode.

The optimization work eliminated the unconditional updates and reduced them
to the documented ABI guarantees, which results in a massive performance
win for syscall and scheduling heavy workloads, which in turn breaks the
TCMalloc expectations.

There have been several options discussed to restore the TCMalloc
functionality while preserving the optimization benefits. They all end up
in a series of hard to maintain workarounds, which in the worst case
introduce overhead for everyone, e.g. in the scheduler.

The requirements of TCMalloc and the optimization work are diametrically
opposed and the required workarounds are a maintenance burden. They end up as fragile
constructs, which are blocking further optimization work and are pretty
much guaranteed to cause more subtle issues down the road.

The optimization work heavily depends on the generic entry code, which is
not used by all architectures yet. So the rework preserved the original
mechanism mostly unmodified to keep the support for architectures, which
handle rseq in their own exit to user space loop. That code is currently
optimized out by the compiler on architectures which use the generic entry
code.

This allows reverting to the original behaviour by replacing the
compile time constant conditions with a runtime condition where required,
which disables the optimization and the dependent time slice extension
feature until the runtime condition can be enabled in the RSEQ
registration code on a per task basis again.

The following changes are required to restore the original behavior, which
makes TCMalloc work again:

  1) Replace the compile time constant conditionals with runtime
     conditionals where appropriate to prevent the compiler from optimizing
     the legacy mode out

  2) Enforce unconditional update of IDs on context switch for the
     non-optimized v1 mode

  3) Enforce update of IDs in the pre signal delivery path for the
     non-optimized v1 mode

  4) Enforce update of IDs in the membarrier(RSEQ) IPI for the
     non-optimized v1 mode

  5) Make time slice and future extensions depend on optimized v2 mode

This brings back the full performance problems, but preserves the v2
optimization code and for generic entry code using architectures also the
TIF_RSEQ optimization which avoids a full evaluation of the exit to user
mode loop in many cases.

Fixes: 566d8015f7 ("rseq: Avoid CPU/MM CID updates when no event pending")
Reported-by: Mathias Stearn <mathias@mongodb.com>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Closes: https://lore.kernel.org/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com
Link: https://patch.msgid.link/20260428224427.517051752%40kernel.org
Cc: stable@vger.kernel.org
2026-05-05 16:02:57 +02:00

205 lines
6.3 KiB
C

/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _LINUX_RSEQ_H
#define _LINUX_RSEQ_H
#ifdef CONFIG_RSEQ
#include <linux/sched.h>
#include <uapi/linux/rseq.h>
/*
 * Out-of-line RSEQ slow path handler. Only declared in this header; the
 * inline wrapper rseq_handle_slowpath() below calls it.
 */
void __rseq_handle_slowpath(struct pt_regs *regs);
/*
 * Returns true when the optimized (V2) RSEQ mode applies to @t: the kernel
 * is built with CONFIG_GENERIC_IRQ_ENTRY and the task's has_rseq value is
 * greater than one. A true result therefore implies that has_rseq is set.
 *
 * NOTE(review): has_rseq > 1 is presumably set by the RSEQ registration
 * code for V2 registrations; that code is not visible here - confirm.
 */
static __always_inline bool rseq_v2(struct task_struct *t)
{
	/* Compile time constant: without generic IRQ entry this is always false */
	if (!IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		return false;

	return likely(t->rseq.event.has_rseq > 1);
}
/*
 * Invoked from resume_user_mode_work().
 *
 * Calls __rseq_handle_slowpath() for the current task when work is pending:
 *  - with CONFIG_GENERIC_ENTRY: the slowpath event is set
 *  - otherwise: a sched_switch event is pending and RSEQ is registered
 */
static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
		if (!current->rseq.event.slowpath)
			return;
	} else if (!current->rseq.event.sched_switch || !current->rseq.event.has_rseq) {
		/*
		 * Logical operators are required here: has_rseq may hold values
		 * greater than one (see rseq_v2()), so it is not a plain 0/1 flag.
		 */
		return;
	}

	__rseq_handle_slowpath(regs);
}
void __rseq_signal_deliver(int sig, struct pt_regs *regs);

/*
 * Invoked from signal delivery to fix up RSEQ state based on the register
 * context before switching to the signal delivery context.
 *
 * V2 tasks only need the fixup when the user_irq event is set. All other
 * registered tasks get it on every signal delivery.
 */
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
	if (rseq_v2(current)) {
		/* has_rseq is implied in rseq_v2() */
		if (!current->rseq.event.user_irq)
			return;
	} else if (!current->rseq.event.has_rseq) {
		return;
	}

	__rseq_signal_deliver(ksig->sig, regs);
}
/* Set the TIF_RSEQ thread flag on @t */
static inline void rseq_raise_notify_resume(struct task_struct *t)
{
	set_tsk_thread_flag(t, TIF_RSEQ);
}
/*
 * Invoked from context switch to force evaluation of the RSEQ state of @t
 * on exit to user space.
 */
static __always_inline void rseq_sched_switch_event(struct task_struct *t)
{
	struct rseq_event *ev = &t->rseq.event;

	if (!rseq_v2(t)) {
		/*
		 * Legacy users like TCMalloc rely on the original ABI V1
		 * behaviour which updates IDs on every context switch. So mark
		 * the IDs as changed and request the update whenever RSEQ is
		 * registered.
		 */
		if (!ev->has_rseq)
			return;

		ev->ids_changed = true;
		ev->sched_switch = true;
		rseq_raise_notify_resume(t);
		return;
	}

	/*
	 * RSEQ ABI V2 registrations only: apply the user_irq optimization.
	 *
	 * TIF_NOTIFY_RESUME or TIF_RSEQ has to be raised only when the CPU or
	 * MM CID has changed or the entry was via interrupt from user space.
	 * The bitwise OR is intentional: it folds both event flags into one
	 * test instead of a boat load of conditionals. ev->has_rseq does not
	 * have to be evaluated because rseq_v2() implies has_rseq.
	 */
	if (ev->user_irq | ev->ids_changed) {
		ev->sched_switch = true;
		rseq_raise_notify_resume(t);
	}
}
/*
 * Mark the IDs of @t as changed to enforce an IDs update.
 *
 * Invoked from __set_task_cpu() when a task migrates or from
 * mm_cid_schedin() when the CID changes.
 *
 * Only the event bit is recorded here. TIF_NOTIFY_RESUME is not raised
 * because that happens in rseq_sched_switch_event().
 */
static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
	t->rseq.event.ids_changed = true;
}
/*
 * Enforce a full update after RSEQ registration and when execve() failed.
 *
 * Does nothing unless the current task has RSEQ registered. Otherwise it
 * flags both the ids_changed and sched_switch events and raises TIF_RSEQ.
 */
static inline void rseq_force_update(void)
{
	if (!current->rseq.event.has_rseq)
		return;

	current->rseq.event.ids_changed = true;
	current->rseq.event.sched_switch = true;
	rseq_raise_notify_resume(current);
}
/*
 * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
 * which clears TIF_NOTIFY_RESUME on architectures that don't use the
 * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
 *
 * Updating user space RSEQ at that point would be pointless, as it would
 * have to be done again before returning to user space. That update is
 * skipped because __rseq_handle_slowpath() does nothing when invoked with
 * NULL register state.
 *
 * After returning from guest mode, before exiting to userspace, hypervisors
 * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
 */
static inline void rseq_virt_userspace_exit(void)
{
	/*
	 * The generic optimization for deferring RSEQ updates until the next
	 * exit relies on having a dedicated TIF_RSEQ. With the generic TIF
	 * bits available there is nothing to re-raise here.
	 */
	if (IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS))
		return;

	/* A context switch event is still pending: raise the flag again */
	if (current->rseq.event.sched_switch)
		rseq_raise_notify_resume(current);
}
/*
 * Reset the RSEQ state of @t: zero the whole structure and mark the cached
 * CPU ID as uninitialized.
 */
static inline void rseq_reset(struct task_struct *t)
{
	/*
	 * Protect against preemption and membarrier IPI. The guard has to be
	 * taken before the state is touched.
	 */
	guard(irqsave)();

	memset(&t->rseq, 0, sizeof(t->rseq));

	/* Zero is not the "uninitialized" marker, so set it explicitly */
	t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}
/*
 * Drop the RSEQ state of @t by resetting it via rseq_reset().
 *
 * NOTE(review): presumably invoked from the execve() path; the caller is
 * not visible in this header - confirm.
 */
static inline void rseq_execve(struct task_struct *t)
{
	rseq_reset(t);
}
/*
 * Set up the RSEQ state of a new task @t depending on @clone_flags.
 *
 * If the parent process has a registered restartable sequences area, the
 * child inherits it. For a clone with CLONE_VM set, rseq is unregistered
 * instead.
 *
 * On fork, the IDs (CPU, MMCID) of the parent are kept. This avoids a
 * fault on the COW page on exit to user space when the child stays on the
 * same CPU as the parent. That's obviously not guaranteed, but in
 * overcommit scenarios it is more likely and optimizes for the fork/exec
 * case without taking the fault.
 */
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
	if (!(clone_flags & CLONE_VM)) {
		/* Inherit the complete parent state including the IDs */
		t->rseq = current->rseq;
		return;
	}

	/* CLONE_VM: start over unregistered */
	rseq_reset(t);
}
/*
 * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq
 * registration.
 *
 * The active rseq area size, offsetof(struct rseq, end), is rounded up to
 * the next power of 2. This guarantees that the rseq structure will always
 * be aligned on the nearest power of two large enough to contain it, even
 * as it grows.
 */
static inline unsigned int rseq_alloc_align(void)
{
	int order = get_count_order(offsetof(struct rseq, end));

	return 1U << order;
}
#else /* CONFIG_RSEQ */
/*
 * CONFIG_RSEQ=n variants: rseq_v2() always returns false and all other
 * hooks are empty inline functions.
 */
static inline bool rseq_v2(struct task_struct *t) { return false; }
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
#endif /* !CONFIG_RSEQ */
/*
 * rseq_syscall() is only provided out of line with CONFIG_DEBUG_RSEQ.
 * Without it the call is an empty inline function.
 */
#ifdef CONFIG_DEBUG_RSEQ
void rseq_syscall(struct pt_regs *regs);
#else /* CONFIG_DEBUG_RSEQ */
static inline void rseq_syscall(struct pt_regs *regs) { }
#endif /* !CONFIG_DEBUG_RSEQ */
/*
 * Time slice extension interfaces. With CONFIG_RSEQ_SLICE_EXTENSION they
 * are provided out of line. Without it the syscall entry hook is empty
 * and the prctl handler fails unconditionally.
 */
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
void rseq_syscall_enter_work(long syscall);
int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static inline void rseq_syscall_enter_work(long syscall) { }
/*
 * NOTE(review): ENOTSUPP is a kernel internal error code. If this return
 * value can reach user space through prctl(), confirm whether -EOPNOTSUPP
 * is the intended code.
 */
static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
return -ENOTSUPP;
}
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
#endif /* _LINUX_RSEQ_H */