From f2e555fc04ba238cd1dd21040325fa0629ee433d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 15:43:00 +0200 Subject: [PATCH 01/49] rcu/exp: Protect against early QS report When a grace period is started, the ->expmask of each node is set up from sync_exp_reset_tree(). Then later on, each leaf node also initializes its ->exp_tasks pointer. This means that the initialization of the quiescent state of a node and the initialization of its blocking tasks happen with an unlocked node gap in-between. It happens to be fine because nothing is expected to report an exp quiescent state within this gap, since no IPIs have been issued yet and every rdp's ->cpu_no_qs.b.exp should be false. However, if it were to happen by accident, the quiescent state could be reported and propagated while ignoring tasks that blocked _before_ the start of the grace period. Prevent such trouble from happening in the future and initialize both the quiescent-state mask to report and the blocked-tasks head from within the same node-locked block. If a task blocks within an RCU read-side critical section before sync_exp_reset_tree() is called and is then unblocked between sync_exp_reset_tree() and __sync_rcu_exp_select_node_cpus(), the QS won't be reported because no RCU exp IPI had been issued to request it through the setting of srdp->cpu_no_qs.b.exp. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_exp.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index c36c7d5575ca..2fa7aa9155bd 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -141,6 +141,13 @@ static void __maybe_unused sync_exp_reset_tree(void) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); WRITE_ONCE(rnp->expmask, rnp->expmaskinit); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp)) + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -393,13 +400,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited GP - * until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ From 3dfdfaff2d49aa7895ced8a54e7d61755eac6830 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 12:08:40 +0200 Subject: [PATCH 02/49] rcu: Robustify rcu_is_cpu_rrupt_from_idle() RCU relies on the context tracking nesting counter in order to determine if it is running in an extended quiescent state. However, the context tracking nesting counter is not completely synchronized with the actual context tracking state: * The nesting counter is set to 1 or incremented further _after_ the actual state is set to RCU watching. * The nesting counter is set to 0 or decremented further _before_ the actual state is set to RCU not watching. Therefore it is safe to assume that if ct_nesting() > 0, RCU is watching. 
But if ct_nesting() <= 0, RCU is not watching except for tiny windows. This hasn't been a problem so far because rcu_is_cpu_rrupt_from_idle() has only been called from interrupts. However the code is confusing and abuses the role of the context tracking nesting counter while there are more accurate indicators available. Clarify and robustify accordingly. Signed-off-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14d4499c6fc3..f83bbb408895 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -377,7 +377,7 @@ EXPORT_SYMBOL_GPL(rcu_momentary_eqs); */ static int rcu_is_cpu_rrupt_from_idle(void) { - long nesting; + long nmi_nesting = ct_nmi_nesting(); /* * Usually called from the tick; but also used from smp_function_call() @@ -389,21 +389,28 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* Check for counter underflows */ RCU_LOCKDEP_WARN(ct_nesting() < 0, "RCU nesting counter underflow!"); - RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0, - "RCU nmi_nesting counter underflow/zero!"); - /* Are we at first interrupt nesting level? */ - nesting = ct_nmi_nesting(); - if (nesting > 1) + /* Non-idle interrupt or nested idle interrupt */ + if (nmi_nesting > 1) return false; /* - * If we're not in an interrupt, we must be in the idle task! + * Non nested idle interrupt (interrupting section where RCU + * wasn't watching). */ - WARN_ON_ONCE(!nesting && !is_idle_task(current)); + if (nmi_nesting == 1) + return true; - /* Does CPU appear to be idle from an RCU standpoint? */ - return ct_nesting() == 0; + /* Not in an interrupt */ + if (!nmi_nesting) { + RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current), + "RCU nmi_nesting counter not in idle task!"); + return !rcu_is_watching_curr_cpu(); + } + + RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!"); + + return false; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) From 9ea40db969115022335a553b4f6fe731dcd90c2a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 May 2025 16:45:02 -0700 Subject: [PATCH 03/49] rcutorture: Print only one rtort_pipe_count splat The rcu_torture_writer() function scans the memory blocks after a stutter (or forced idle) interval, complaining about any that have not passed through ten grace periods since the start of the stutter interval. But one splat suffices, so this commit therefore stops at the first splat. Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 70ec0f21abc3..d1e0d61d8815 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1722,6 +1722,7 @@ rcu_torture_writer(void *arg) cur_ops->gp_kthread_dbg(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); + break; } if (stutter_waited) sched_set_normal(current, oldnice); From 635bdb9d22796fc6ba1958ec73ab0a0a693c58d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 May 2025 16:45:01 -0700 Subject: [PATCH 04/49] rcutorture: Start rcu_torture_writer() after rcu_torture_reader() Testing of rcutorture's SRCU-P scenario on a large arm64 system resulted in rcu_torture_writer() forward-progress failures, but these same tests passed on x86. 
After some off-list discussion of possible memory-ordering causes for these failures, Boqun showed that these were in fact due to reordering, but by the scheduler, not by the memory system. On x86, rcu_torture_writer() would have run quickly enough that by the time the rcu_torture_updown() kthread started, the rcu_torture_current variable would already be initialized, thus avoiding a bug in which a NULL value would cause rcu_torture_updown() to do an extra call to srcu_up_read_fast(). This commit therefore moves creation of the rcu_torture_writer() kthread after that of the rcu_torture_reader() kthreads. This results in deterministic failures on x86. What about the double-srcu_up_read_fast() bug? Boqun has the fix. But let's also fix the test while we are at it! Reported-by: Joel Fernandes Reported-by: Boqun Feng Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d1e0d61d8815..a209d2419cfd 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -4246,11 +4246,6 @@ rcu_torture_init(void) /* Start up the kthreads. */ rcu_torture_write_types(); - firsterr = torture_create_kthread(rcu_torture_writer, NULL, - writer_task); - if (torture_init_error(firsterr)) - goto unwind; - if (nrealfakewriters > 0) { fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), @@ -4283,6 +4278,12 @@ rcu_torture_init(void) if (torture_init_error(firsterr)) goto unwind; } + + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (torture_init_error(firsterr)) + goto unwind; + nrealnocbers = nocbs_nthreads; if (WARN_ON(nrealnocbers < 0)) nrealnocbers = 1; From eec1f94cf77f8a7f7078365f5029a56b0ce16580 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Jun 2025 12:42:14 -0700 Subject: [PATCH 05/49] rcutorture: Make rcutorture_one_extend_check() account for hard IRQs This commit retrospectively prepares for testing of RCU readers invoked from hardware interrupt handlers (for example, HRTIMER_MODE_HARD hrtimer handlers) in kernels built with CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE=y, which is rarely used but sometimes extremely useful. This preparation involves taking early exits if in_hardirq(), and, while we are in the area, a very early exit if in_nmi(). This means that a number of insoftirq parameters are no longer needed, but that is the subject of a later commit. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202505140917.8ee62cc6-lkp@intel.com Signed-off-by: Paul E. 
McKenney Tested-by: kernel test robot Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a209d2419cfd..7944cc69321d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1920,10 +1920,10 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, { int mask; - if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi()) return; - WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS); + WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS); WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS); // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable. @@ -1938,9 +1938,9 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && cur_ops->readlock_nesting() == 0, ROEC_ARGS); - // Timer handlers have all sorts of stuff disabled, so ignore + // Interrupt handlers have all sorts of stuff disabled, so ignore // unintended disabling. - if (insoftirq) + if (in_serving_softirq() || in_hardirq()) return; WARN_ONCE(cur_ops->extendables && From 1b67e031d478bf386282c7c5ff3f19cd9c1e5a39 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Feb 2025 11:37:55 -0800 Subject: [PATCH 06/49] rcutorture: Add tests for SRCU up/down reader primitives This commit adds a new rcutorture.n_up_down kernel boot parameter that specifies the number of outstanding SRCU up/down readers, which begin in kthread context and end in an hrtimer handler. There is a new kthread ("rcu_torture_updown") that scans a per-reader array looking for elements whose readers have ended. This kthread sleeps between one and two milliseconds between consecutive scans. [ paulmck: Apply kernel test robot feedback. ] [ paulmck: Apply Zqiang feedback. ] [ joel: Fix build error: hrtimer_init is replaced by hrtimer_setup. ] [ joel: Apply Boqun bug fix to drop extra up_read() call in rcu_torture_updown(). ] Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Tested-by: kernel test robot Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 225 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 206 insertions(+), 19 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7944cc69321d..bf87c0c4e77e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -55,22 +55,24 @@ MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -/* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) -#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ -#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ -#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ -#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ -#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. 
*/ -#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ -#define RCUTORTURE_MAX_EXTEND \ +// Bits for ->extendables field, extendables param, and related definitions. +#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh. +#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts. +#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption. +#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh(). +#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched(). +#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader. +#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader. +#define RCUTORTURE_RDR_UPDOWN 0x80 // ... up-read from task, down-read from timer. + // Note: Manual start, automatic end. +#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above. +#define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ - RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) + RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN. #define RCUTORTURE_RDR_ALLBITS \ (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \ RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2) @@ -110,6 +112,7 @@ torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); +torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); @@ -156,6 +159,7 @@ static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; +static struct task_struct *updown_task; static struct task_struct **nocb_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; @@ -378,6 +382,8 @@ struct rcu_torture_ops { void (*readunlock)(int idx); int (*readlock_held)(void); // lockdep. int (*readlock_nesting)(void); // actual nesting, if available, -1 if not. 
+ int (*down_read)(void); + void (*up_read)(int idx); unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); @@ -427,6 +433,7 @@ struct rcu_torture_ops { int no_pi_lock; int debug_objects; int start_poll_irqsoff; + int have_up_down; const char *name; }; @@ -762,6 +769,50 @@ static int torture_srcu_read_lock_held(void) return srcu_read_lock_held(srcu_ctlp); } +static bool srcu_torture_have_up_down(void) +{ + int rf = reader_flavor; + + if (!rf) + rf = SRCU_READ_FLAVOR_NORMAL; + return !!(cur_ops->have_up_down & rf); +} + +static int srcu_torture_down_read(void) +{ + int idx; + struct srcu_ctr __percpu *scp; + + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + WARN_ON_ONCE(reader_flavor & (reader_flavor - 1)); + + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { + idx = srcu_down_read(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + return idx; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_down_read_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + return idx << 3; + } + WARN_ON_ONCE(1); + return 0; +} + +static void srcu_torture_up_read(int idx) +{ + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || + !(reader_flavor & SRCU_READ_FLAVOR_ALL)) + srcu_up_read(srcu_ctlp, idx & 0x1); + else + WARN_ON_ONCE(1); +} + static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(srcu_ctlp); @@ -819,6 +870,8 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, .gp_diff = rcu_seq_diff, @@ -839,6 +892,8 @@ static struct rcu_torture_ops srcu_ops = { .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, .name = "srcu" }; @@ -864,6 +919,8 @@ static struct rcu_torture_ops srcud_ops = { .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .get_gp_seq = srcu_torture_completed, .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, @@ -883,6 +940,8 @@ static struct rcu_torture_ops srcud_ops = { .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 
0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, .name = "srcud" }; @@ -1994,7 +2053,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); - WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); + WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN)); rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); rtrsp->rt_readstate = newstate; @@ -2070,6 +2129,11 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } + if (statesold & RCUTORTURE_RDR_UPDOWN) { + cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; + } /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) @@ -2210,7 +2274,8 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp rtorsp->started = cur_ops->get_gp_seq(); rtorsp->ts = rcu_trace_clock_local(); rtorsp->p = rcu_dereference_check(rcu_torture_current, - !cur_ops->readlock_held || cur_ops->readlock_held()); + !cur_ops->readlock_held || cur_ops->readlock_held() || + (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN)); if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); @@ -2379,6 +2444,121 @@ rcu_torture_reader(void *arg) return 0; } +struct rcu_torture_one_read_state_updown { + struct hrtimer rtorsu_hrt; + bool rtorsu_inuse; + struct torture_random_state rtorsu_trs; + struct rcu_torture_one_read_state rtorsu_rtors; +}; + +static struct rcu_torture_one_read_state_updown *updownreaders; +static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand); +static int rcu_torture_updown(void *arg); + +static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) +{ + struct rcu_torture_one_read_state_updown *rtorsup; + + rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + smp_store_release(&rtorsup->rtorsu_inuse, false); + return HRTIMER_NORESTART; +} + +static int rcu_torture_updown_init(void) +{ + int i; + struct torture_random_state *rand = &rcu_torture_updown_rand; + int ret; + + if (n_up_down < 0) + return 0; + if (!srcu_torture_have_up_down()) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives"); + return 0; + } + updownreaders = kcalloc(n_up_down, sizeof(*updownreaders), GFP_KERNEL); + if (!updownreaders) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests"); + return -ENOMEM; + } + for (i = 0; i < n_up_down; i++) { + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand); + hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC, + HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + torture_random_init(&updownreaders[i].rtorsu_trs); + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, + &updownreaders[i].rtorsu_trs); + } + ret = torture_create_kthread(rcu_torture_updown, rand, updown_task); + if (ret) { + kfree(updownreaders); + updownreaders = NULL; + } + return ret; +} + +static void rcu_torture_updown_cleanup(void) +{ + struct rcu_torture_one_read_state_updown *rtorsup; + + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; 
rtorsup++) { + if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) + continue; + if (!hrtimer_cancel(&rtorsup->rtorsu_hrt)) + WARN_ON_ONCE(rtorsup->rtorsu_inuse); + + } + kfree(updownreaders); + updownreaders = NULL; +} + +/* + * RCU torture up/down reader kthread, starting RCU readers in kthread + * context and ending them in hrtimer handlers. Otherwise similar to + * rcu_torture_reader(). + */ +static int +rcu_torture_updown(void *arg) +{ + int idx; + int rawidx; + struct rcu_torture_one_read_state_updown *rtorsup; + ktime_t t; + + VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); + do { + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (torture_must_stop()) + break; + if (smp_load_acquire(&rtorsup->rtorsu_inuse)) + continue; + init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, + &rtorsup->rtorsu_trs); + rawidx = cur_ops->down_read(); + idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; + rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; + rtorsup->rtorsu_rtors.rtrsp++; + if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, + &rtorsup->rtorsu_trs, -1)) { + schedule_timeout_idle(HZ); + continue; + } + smp_store_release(&rtorsup->rtorsu_inuse, true); + t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. + if (t < 10 * 1000) + t = 200 * 1000 * 1000; + hrtimer_start(&rtorsup->rtorsu_hrt, t, + HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + } + torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); + stutter_wait("rcu_torture_updown"); + } while (!torture_must_stop()); + rcu_torture_updown_cleanup(); + torture_kthread_stopping("rcu_torture_updown"); + return 0; +} + /* * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to * increase race probabilities and fuzzes the interval between toggling. @@ -2633,7 +2813,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " - "preempt_duration=%d preempt_interval=%d\n", + "preempt_duration=%d preempt_interval=%d n_up_down=%d\n", torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -2647,7 +2827,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) reader_flavor, nocbs_nthreads, nocbs_toggle, test_nmis, - preempt_duration, preempt_interval); + preempt_duration, preempt_interval, n_up_down); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3750,6 +3930,10 @@ rcu_torture_cleanup(void) nocb_tasks = NULL; } + if (updown_task) { + torture_stop_kthread(rcu_torture_updown, updown_task); + updown_task = NULL; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) torture_stop_kthread(rcu_torture_reader, @@ -4284,6 +4468,9 @@ rcu_torture_init(void) if (torture_init_error(firsterr)) goto unwind; + firsterr = rcu_torture_updown_init(); + if (torture_init_error(firsterr)) + goto unwind; nrealnocbers = nocbs_nthreads; if (WARN_ON(nrealnocbers < 0)) nrealnocbers = 1; From 065de24265155fe662aa9f69a49e0697a5b3b4b5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 9 Feb 2025 04:58:36 -0800 Subject: [PATCH 07/49] rcutorture: Pull rcu_torture_updown() loop body into new function This is strictly a code-movement commit, pulling that part of the rcu_torture_updown() function's loop body that processes one rcu_torture_one_read_state_updown structure into a new rcu_torture_updown_one() function. The checks for the end of the torture test and the current structure being in use remain in the rcu_torture_updown() function. Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 44 ++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bf87c0c4e77e..13cf0e472868 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2513,6 +2513,29 @@ static void rcu_torture_updown_cleanup(void) updownreaders = NULL; } +// Do one reader for rcu_torture_updown(). +static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup) +{ + int idx; + int rawidx; + ktime_t t; + + init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + rawidx = cur_ops->down_read(); + idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; + rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; + rtorsup->rtorsu_rtors.rtrsp++; + if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { + schedule_timeout_idle(HZ); + return; + } + smp_store_release(&rtorsup->rtorsu_inuse, true); + t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. + if (t < 10 * 1000) + t = 200 * 1000 * 1000; + hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); +} + /* * RCU torture up/down reader kthread, starting RCU readers in kthread * context and ending them in hrtimer handlers. Otherwise similar to @@ -2521,10 +2544,7 @@ static void rcu_torture_updown_cleanup(void) static int rcu_torture_updown(void *arg) { - int idx; - int rawidx; struct rcu_torture_one_read_state_updown *rtorsup; - ktime_t t; VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); do { @@ -2533,23 +2553,7 @@ rcu_torture_updown(void *arg) break; if (smp_load_acquire(&rtorsup->rtorsu_inuse)) continue; - init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, - &rtorsup->rtorsu_trs); - rawidx = cur_ops->down_read(); - idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; - rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; - rtorsup->rtorsu_rtors.rtrsp++; - if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, - &rtorsup->rtorsu_trs, -1)) { - schedule_timeout_idle(HZ); - continue; - } - smp_store_release(&rtorsup->rtorsu_inuse, true); - t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. - if (t < 10 * 1000) - t = 200 * 1000 * 1000; - hrtimer_start(&rtorsup->rtorsu_hrt, t, - HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + rcu_torture_updown_one(rtorsup); } torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); stutter_wait("rcu_torture_updown"); From 62d92c9b87db43cc255411fe2827b9de42fa0523 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Feb 2025 06:59:24 -0800 Subject: [PATCH 08/49] rcutorture: Complain if an ->up_read() is delayed more than 10 seconds The down/up SRCU reader testing uses an hrtimer handler to exit the SRCU read-side critical section. This might be delayed, and if delayed for too long, it can prevent the rcutorture run from completing. 
This commit therefore complains if the hrtimer handler is delayed for more than ten seconds. [ paulmck, joel: Apply kernel test robot feedback to avoid false-positive complaint of excessive ->up_read() delays by using HRTIMER_MODE_HARD ] Tested-by: kernel test robot Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 13cf0e472868..22288efc8c67 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2447,6 +2447,8 @@ rcu_torture_reader(void *arg) struct rcu_torture_one_read_state_updown { struct hrtimer rtorsu_hrt; bool rtorsu_inuse; + ktime_t rtorsu_kt; + unsigned long rtorsu_j; struct torture_random_state rtorsu_trs; struct rcu_torture_one_read_state rtorsu_rtors; }; @@ -2485,7 +2487,7 @@ static int rcu_torture_updown_init(void) for (i = 0; i < n_up_down; i++) { init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand); hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC, - HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + HRTIMER_MODE_REL | HRTIMER_MODE_HARD); torture_random_init(&updownreaders[i].rtorsu_trs); init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, &updownreaders[i].rtorsu_trs); @@ -2533,7 +2535,10 @@ static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rto t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. if (t < 10 * 1000) t = 200 * 1000 * 1000; - hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + smp_mb(); // Sample jiffies after posting hrtimer. + rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler. + rtorsup->rtorsu_kt = t; } /* @@ -2544,6 +2549,7 @@ static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rto static int rcu_torture_updown(void *arg) { + unsigned long j; struct rcu_torture_one_read_state_updown *rtorsup; VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); @@ -2551,8 +2557,12 @@ rcu_torture_updown(void *arg) for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { if (torture_must_stop()) break; - if (smp_load_acquire(&rtorsup->rtorsu_inuse)) + j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse. + if (smp_load_acquire(&rtorsup->rtorsu_inuse)) { + WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10), + "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j); continue; + } rcu_torture_updown_one(rtorsup); } torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); From 93856948be8f5d784a28cfb2eb16884a5522faaa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Feb 2025 09:29:35 -0800 Subject: [PATCH 09/49] rcutorture: Check for ->up_read() without matching ->down_read() This commit creates counters in the rcu_torture_one_read_state_updown structure that check for a call to ->up_read() that lacks a matching call to ->down_read(). While in the area, add end-of-run cleanup code that prevents calls to rcu_torture_updown_hrt() from happening after the test has moved on. Yes, the srcu_barrier() at the end of the test will wait for them, but this could result in confusing states, statistics, and diagnostic information. So explicitly wait for them before we get to the end-of-test output. 
[ paulmck: Apply kernel test robot feedback. ] [ joel: Apply Boqun's fix for counter increment ordering. ] Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 22288efc8c67..f773d4f8f2ae 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2449,6 +2449,8 @@ struct rcu_torture_one_read_state_updown { bool rtorsu_inuse; ktime_t rtorsu_kt; unsigned long rtorsu_j; + unsigned long rtorsu_ndowns; + unsigned long rtorsu_nups; struct torture_random_state rtorsu_trs; struct rcu_torture_one_read_state rtorsu_rtors; }; @@ -2463,6 +2465,8 @@ static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + rtorsup->rtorsu_nups++; smp_store_release(&rtorsup->rtorsu_inuse, false); return HRTIMER_NORESTART; } @@ -2507,8 +2511,12 @@ static void rcu_torture_updown_cleanup(void) for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) continue; - if (!hrtimer_cancel(&rtorsup->rtorsu_hrt)) - WARN_ON_ONCE(rtorsup->rtorsu_inuse); + if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + rtorsup->rtorsu_nups++; + smp_store_release(&rtorsup->rtorsu_inuse, false); + } } kfree(updownreaders); @@ -2524,10 +2532,13 @@ static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rto init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); rawidx = cur_ops->down_read(); + rtorsup->rtorsu_ndowns++; idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; rtorsup->rtorsu_rtors.rtrsp++; if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + rtorsup->rtorsu_nups++; schedule_timeout_idle(HZ); return; } From f6c8785f50443db4c70faebfc22e59e8064c35a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 May 2025 16:45:00 -0700 Subject: [PATCH 10/49] rcutorture: Check for no up/down readers at task level The design of testing of up/down readers such as srcu_down_read() and srcu_up_read() assumes that these are tested only by the rcu_torture_updown() kthread, and never by the rcu_torture_reader() kthread. Because we all know which road is paved with good intentions, this commit adds WARN_ON_ONCE() to verify that things are going to plan. Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f773d4f8f2ae..10f3cc4861ee 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2232,6 +2232,7 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); + WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN); rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]); } return &rtrsp[j]; @@ -2368,6 +2369,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) WARN_ON_ONCE(!rcu_is_watching()); init_rcu_torture_one_read_state(&rtors, trsp); newstate = rcutorture_extend_mask(rtors.readstate, trsp); + WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++); if (!rcu_torture_one_read_start(&rtors, trsp, myid)) { rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp); From cacba0bf6d9f7568e115f3ffe32a95c33f5fd885 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Apr 2025 11:22:49 -0700 Subject: [PATCH 11/49] rcutorture: Print number of RCU up/down readers and migrations This commit prints the number of RCU up/down readers and the number of such readers that migrated from one CPU to another, along with the rest of the periodic rcu_torture_stats_print() output. These statistics are currently used only by srcu_down_read{,_fast}() and srcu_up_read{,_fast}(). Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 10f3cc4861ee..3f644b1f5823 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2450,9 +2450,11 @@ struct rcu_torture_one_read_state_updown { struct hrtimer rtorsu_hrt; bool rtorsu_inuse; ktime_t rtorsu_kt; + int rtorsu_cpu; unsigned long rtorsu_j; unsigned long rtorsu_ndowns; unsigned long rtorsu_nups; + unsigned long rtorsu_nmigrates; struct torture_random_state rtorsu_trs; struct rcu_torture_one_read_state rtorsu_rtors; }; @@ -2463,12 +2465,15 @@ static int rcu_torture_updown(void *arg); static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) { + int cpu = raw_smp_processor_id(); struct rcu_torture_one_read_state_updown *rtorsup; rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); - rtorsup->rtorsu_nups++; + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + WRITE_ONCE(rtorsup->rtorsu_nmigrates, + rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu)); smp_store_release(&rtorsup->rtorsu_inuse, false); return HRTIMER_NORESTART; } @@ -2516,7 +2521,7 @@ static void rcu_torture_updown_cleanup(void) if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) continue; if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); - rtorsup->rtorsu_nups++; + 
WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); smp_store_release(&rtorsup->rtorsu_inuse, false); } @@ -2534,13 +2539,14 @@ static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rto init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); rawidx = cur_ops->down_read(); - rtorsup->rtorsu_ndowns++; + WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1); idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; rtorsup->rtorsu_rtors.rtrsp++; + rtorsup->rtorsu_cpu = raw_smp_processor_id(); if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); - rtorsup->rtorsu_nups++; + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); schedule_timeout_idle(HZ); return; } @@ -2649,6 +2655,10 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long n_gpwraps = 0; + unsigned long ndowns = 0; + unsigned long nunexpired = 0; + unsigned long nmigrates = 0; + unsigned long nups = 0; struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; @@ -2662,10 +2672,18 @@ rcu_torture_stats_print(void) if (cur_ops->get_gpwrap_count) n_gpwraps += cur_ops->get_gpwrap_count(cpu); } + if (updownreaders) { + for (i = 0; i < n_up_down; i++) { + ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns); + nups += READ_ONCE(updownreaders[i].rtorsu_nups); + nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse); + nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates); + } + } for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; - } + } // The value of variable "i" is used later, so don't clobber it! pr_alert("%s%s ", torture_type, TORTURE_FLAG); rtcp = rcu_access_pointer(rcu_torture_current); @@ -2686,6 +2704,8 @@ rcu_torture_stats_print(void) n_rcu_torture_boost_failure, n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); + if (updownreaders) + pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), From f32367d96eba8fc50a0bbd2a792df3199089d13b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 May 2025 21:09:05 -0700 Subject: [PATCH 12/49] rcutorture: Drop redundant "insoftirq" parameters Given that the rcutorture_one_extend_check() function now uses in_serving_softirq() and in_hardirq(), it is no longer necessary to pass insoftirq flags down the function-call stack. This commit therefore removes those flags, and, while in the area, does a bit of whitespace cleanup. Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3f644b1f5823..470b5a117602 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1975,7 +1975,7 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, // Verify the specified RCUTORTURE_RDR* state. 
#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() -static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) +static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) { int mask; @@ -2038,8 +2038,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, * beginning or end of the critical section and if there was actually a * change, do a ->read_delay(). */ -static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, - struct torture_random_state *trsp, +static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { bool first; @@ -2054,7 +2053,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN)); - rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("before change", idxold1, statesnew, statesold); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -2074,8 +2073,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; // Complain unless both the old and the new protection is in place. - rcutorture_one_extend_check("during change", - idxold1 | statesnew, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold); // Sample CPU under both sets of protections to reduce confusion. if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { @@ -2150,7 +2148,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, WARN_ON_ONCE(*readstate < 0); if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) pr_info("Unexpected readstate value of %#x\n", *readstate); - rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("after change", *readstate, statesnew, statesold); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -2217,8 +2215,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * critical section. 
*/ static struct rt_read_seg * -rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp, - struct rt_read_seg *rtrsp) +rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { int i; int j; @@ -2233,7 +2230,7 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN); - rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]); + rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); } return &rtrsp[j]; } @@ -2279,7 +2276,7 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN)); if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); return false; } if (rtorsp->p->rtort_mbtest == 0) @@ -2293,7 +2290,7 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp * critical sections and check for errors. */ static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, - struct torture_random_state *trsp, long myid) + struct torture_random_state *trsp) { int i; unsigned long completed; @@ -2340,7 +2337,7 @@ static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, } if (cur_ops->reader_blocked) preempted = cur_ops->reader_blocked(); - rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); WARN_ON_ONCE(rtorsp->readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. 
@@ -2370,13 +2367,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) init_rcu_torture_one_read_state(&rtors, trsp); newstate = rcutorture_extend_mask(rtors.readstate, trsp); WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); - rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++); + rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++); if (!rcu_torture_one_read_start(&rtors, trsp, myid)) { - rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp); + rcutorture_one_extend(&rtors.readstate, 0, trsp, rtors.rtrsp); return false; } - rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, myid < 0, trsp, rtors.rtrsp); - rcu_torture_one_read_end(&rtors, trsp, myid); + rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp); + rcu_torture_one_read_end(&rtors, trsp); return true; } @@ -2469,7 +2466,7 @@ static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) struct rcu_torture_one_read_state_updown *rtorsup; rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); - rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); WRITE_ONCE(rtorsup->rtorsu_nmigrates, @@ -2519,7 +2516,7 @@ static void rcu_torture_updown_cleanup(void) if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) continue; if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { - rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); smp_store_release(&rtorsup->rtorsu_inuse, false); From 5f2417ba05541332f334b557febc3af355317f2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 16 May 2025 14:20:46 -0700 Subject: [PATCH 13/49] rcutorture: Make Trivial RCU ignore onoff_interval and shuffle_interval Trivial RCU is a textbook implementation that is not used in the Linux kernel, but tested to keep textbooks (and presentations) honest. It is so trivial that it cannot deal with either CPU hotplug or external migration from one CPU to another. This commit therefore splats whenever onoff_interval or shuffle_interval are non-zero, and then sets them to zero in order to avoid false-positive failures. Those wishing to set these module parameters in order to force failures in Trivial RCU are free to revert this commit. Just don't expect me to be sympathetic to any resulting bug reports! Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202505131651.af6e81d7-lkp@intel.com Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 470b5a117602..7a07b2590c27 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -969,7 +969,8 @@ static struct rcu_torture_ops busted_srcud_ops = { /* * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. 
- * This implementation does not necessarily work well with CPU hotplug. + * This implementation does not work well with CPU hotplug nor + * with rcutorture's shuffling. */ static void synchronize_rcu_trivial(void) @@ -982,6 +983,16 @@ static void synchronize_rcu_trivial(void) } } +static void rcu_sync_torture_init_trivial(void) +{ + rcu_sync_torture_init(); + // if (onoff_interval || shuffle_interval) { + if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) { + onoff_interval = 0; + shuffle_interval = 0; + } +} + static int rcu_torture_read_lock_trivial(void) { preempt_disable(); @@ -995,7 +1006,7 @@ static void rcu_torture_read_unlock_trivial(int idx) static struct rcu_torture_ops trivial_ops = { .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, + .init = rcu_sync_torture_init_trivial, .readlock = rcu_torture_read_lock_trivial, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock_trivial, From 8d71351d88e478d3c4e945e3218e97ec677fd807 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 7 May 2025 19:26:03 +0800 Subject: [PATCH 14/49] rcutorture: Fix rcutorture_one_extend_check() splat in RT kernels For kernels built with CONFIG_PREEMPT_RT=y, running rcutorture tests resulted in the following splat: [ 68.797425] rcutorture_one_extend_check during change: Current 0x1 To add 0x1 To remove 0x0 preempt_count() 0x0 [ 68.797533] WARNING: CPU: 2 PID: 512 at kernel/rcu/rcutorture.c:1993 rcutorture_one_extend_check+0x419/0x560 [rcutorture] [ 68.797601] Call Trace: [ 68.797602] [ 68.797619] ? lockdep_softirqs_off+0xa5/0x160 [ 68.797631] rcutorture_one_extend+0x18e/0xcc0 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797646] ? local_clock+0x19/0x40 [ 68.797659] rcu_torture_one_read+0xf0/0x280 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797678] ? __pfx_rcu_torture_one_read+0x10/0x10 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797804] ? __pfx_rcu_torture_timer+0x10/0x10 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797815] rcu-torture: rcu_torture_reader task started [ 68.797824] rcu-torture: Creating rcu_torture_reader task [ 68.797824] rcu_torture_reader+0x238/0x580 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797836] ? kvm_sched_clock_read+0x15/0x30 Disabling BH does not change the SOFTIRQ-corresponding bits in preempt_count() for RT kernels, so this commit uses softirq_count() to check whether BH is disabled. Reviewed-by: Paul E. McKenney Signed-off-by: Zqiang Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7a07b2590c27..213f23f20a64 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -471,7 +471,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); - if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) + if ((preempt_count() & HARDIRQ_MASK) || softirq_count()) longdelay_ms = 5; /* Avoid triggering BH limits. 
*/ mdelay(longdelay_ms); rtrsp->rt_delay_ms = longdelay_ms; @@ -2001,7 +2001,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) return; WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && - !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + !softirq_count(), ROEC_ARGS); WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && !(preempt_count() & PREEMPT_MASK), ROEC_ARGS); WARN_ONCE(cur_ops->readlock_nesting && @@ -2015,7 +2015,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) WARN_ONCE(cur_ops->extendables && !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && - (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + softirq_count(), ROEC_ARGS); /* * non-preemptible RCU in a preemptible kernel uses preempt_disable() @@ -2036,6 +2036,9 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count()) + mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && cur_ops->readlock_nesting() > 0, ROEC_ARGS); } From 3b16e77e0706eb6cdfc850956ee1ed5e5c36c6ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Jun 2025 06:29:19 -0700 Subject: [PATCH 15/49] rcutorture: Make BUSTED scenario check and log readers Because the BUSTED scenario intentionally executes too-short readers, this commit enables the RCU_TORTURE_TEST_CHK_RDR_STATE, RCU_TORTURE_TEST_LOG_CPU, and RCU_TORTURE_TEST_LOG_GP Kconfig options to test the resulting reader-segment dump. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/configs/rcu/BUSTED | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED index 48d8a245c7fa..7d75f4b94943 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED +++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED @@ -5,3 +5,6 @@ CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y +CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE=y +CONFIG_RCU_TORTURE_TEST_LOG_CPU=y +CONFIG_RCU_TORTURE_TEST_LOG_GP=y From e40e2391388d8db623fd2ac92d7750648155e9d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 May 2025 16:44:58 -0700 Subject: [PATCH 16/49] torture: Suppress torture.sh "Zero time" messages for disabled tests The torture.sh script prints " --- Zero time for locktorture, disabling" when the --duration parameter is too short to allow the test to run even when locktorture has been disabled, for example, via --do-none. The same is true for scftorture and rcutorture. This commit therefore suppresses this message when the corresponding test has been disabled. Signed-off-by: Paul E. 
McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/torture.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index e03fdaca89b3..c518de296871 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -274,7 +274,7 @@ then configs_rcutorture=CFLIST fi duration_rcutorture=$((duration_base*duration_rcutorture_frac/10)) -if test "$duration_rcutorture" -eq 0 +if test "$duration_rcutorture" -eq 0 && test "$do_rcutorture" = "yes" then echo " --- Zero time for rcutorture, disabling" | tee -a $T/log do_rcutorture=no @@ -286,7 +286,7 @@ then configs_locktorture=CFLIST fi duration_locktorture=$((duration_base*duration_locktorture_frac/10)) -if test "$duration_locktorture" -eq 0 +if test "$duration_locktorture" -eq 0 && test "$do_locktorture" = "yes" then echo " --- Zero time for locktorture, disabling" | tee -a $T/log do_locktorture=no @@ -298,7 +298,7 @@ then configs_scftorture=CFLIST fi duration_scftorture=$((duration_base*duration_scftorture_frac/10)) -if test "$duration_scftorture" -eq 0 +if test "$duration_scftorture" -eq 0 && test "$do_scftorture" = "yes" then echo " --- Zero time for scftorture, disabling" | tee -a $T/log do_scftorture=no From 4176ebdf97d1ad08e205ec97a8f5b66f9efa70bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 May 2025 15:25:42 -0700 Subject: [PATCH 17/49] torture: Permit multiple space characters in kvm.sh --kconfig argument The straightforward way of doing bash substitution for optional strings leaves a pair of space characters, which the kvm.sh --kconfig option rejects as ill-formed. This commit therefore changes the corresponding regular expression to accommodate more than one space character between successive Kconfig options. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 42e5e8597a1a..9c1b850b3227 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -199,7 +199,7 @@ do fi ;; --kconfig|--kconfigs) - checkarg --kconfig "(Kconfig options)" $# "$2" '^\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\( \(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\)*$' '^error$' + checkarg --kconfig "(Kconfig options)" $# "$2" '^\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\( \+\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\)* *$' '^error$' TORTURE_KCONFIG_ARG="`echo "$TORTURE_KCONFIG_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`" shift ;; From 955a83469cb46a1bc4339425e5c9cb97fc6303a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 May 2025 10:20:08 -0700 Subject: [PATCH 18/49] torture: Make torture.sh KCSAN runs set CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE=y The RCU_TORTURE_TEST_CHK_RDR_STATE Kconfig option is used for low-level debugging of rcutorture's generation of overlapping and nested RCU readers. It incurs significant overhead, and is thus not to be used lightly. But if it is not tested regularly, it won't be there when it is needed. For example, it would have found an rcutorture bug in the testing of srcu_up_read(). 
This commit therefore uses CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE=y when building KCSAN kernels, but only for the --do-rcutorture case. Reported-by: kernel test robot Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/torture.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index c518de296871..53f61f278fd7 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -378,7 +378,12 @@ function torture_set { kcsan_kmake_tag="--kmake-args" cur_kcsan_kmake_args="$kcsan_kmake_args" fi - torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan + chk_rdr_state= + if test "${flavor}" = rcutorture + then + chk_rdr_state="CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE=y" + fi + torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y ${chk_rdr_state}" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan mv $T/last-resdir $T/last-resdir-kcsan || : fi } From 1524f2032aad74f604a54d5a5cbb8b214f4da41c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 May 2025 13:38:56 -0700 Subject: [PATCH 19/49] torture: Default --no-rcutasksflavors on arm64 Because arm64 does not support CONFIG_SMP=n kernels, --do-rcutasksflavors gets Kconfig errors when running the TINY01 rcutorture scenario. This commit therefore makes --no-rcutasksflavors be the default on arm64. Once kvm.sh automatically deselects CONFIG_SMP=n rcutorture scenarios on arm64, the two lines marked "FIXME" can be changed back from "${ifnotaarch64}" to "yes". Note that arm64 users can still specify --do-rcutasksflavors in order to override this default. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/torture.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 53f61f278fd7..584d57b48036 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -30,6 +30,15 @@ then VERBOSE_BATCH_CPUS=0 fi +# Machine architecture? ("uname -p" is said to be less portable.) +thisarch="`uname -m`" +if test "${thisarch}" = aarch64 +then + ifnotaarch64=no +else + ifnotaarch64=yes +fi + # Configurations/scenarios. configs_rcutorture= configs_locktorture= @@ -57,7 +66,7 @@ do_kasan=yes do_kcsan=no do_clocksourcewd=yes do_rt=yes -do_rcutasksflavors=yes +do_rcutasksflavors="${ifnotaarch64}" # FIXME: Back to "yes" when SMP=n auto-avoided do_srcu_lockdep=yes do_rcu_rust=no @@ -124,7 +133,7 @@ do ;; --do-all|--doall) do_allmodconfig=yes - do_rcutasksflavor=yes + do_rcutasksflavors="${ifnotaarch64}" # FIXME: Back to "yes" when SMP=n auto-avoided do_rcutorture=yes do_locktorture=yes do_scftorture=yes From 103d567f51e0eaee441bb26769e443a719299412 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 May 2025 13:54:56 -0700 Subject: [PATCH 20/49] torture: Default --no-clocksourcewd on arm64 Because arm64 does not support CONFIG_CLOCKSOURCE_WATCHDOG=n kernels, --do-clocksourcewd gets Kconfig errors. This commit therefore makes --do-no-clocksourcewd be the default on arm64. Note that arm64 users can still specify --do-clocksourcewd in order to override this default. 
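For example (an illustrative command line, not taken from the patches; both flags are quoted from the commit logs above), an arm64 user can restore the old defaults of these two tests with:

	tools/testing/selftests/rcutorture/bin/torture.sh --do-rcutasksflavors --do-clocksourcewd
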
Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/torture.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 584d57b48036..25847042e30e 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -64,7 +64,7 @@ do_normal=yes explicit_normal=no do_kasan=yes do_kcsan=no -do_clocksourcewd=yes +do_clocksourcewd="${ifnotaarch64}" do_rt=yes do_rcutasksflavors="${ifnotaarch64}" # FIXME: Back to "yes" when SMP=n auto-avoided do_srcu_lockdep=yes @@ -145,7 +145,7 @@ do explicit_normal=no do_kasan=yes do_kcsan=yes - do_clocksourcewd=yes + do_clocksourcewd="${ifnotaarch64}" do_srcu_lockdep=yes ;; --do-allmodconfig|--do-no-allmodconfig|--no-allmodconfig) From a33ad03aaed2c39d29a27a64ee55ff919810de59 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 18 Mar 2025 10:23:36 +0100 Subject: [PATCH 21/49] rcu/nocb: Dump gp state even if rdp gp itself is not offloaded When a stall is detected, the state of each NOCB CPU is dumped along with the state of each NOCB group. The latter part however is incidentally ignored if the NOCB group leader happens not to be offloaded itself. Fix this to make sure related precious information isn't lost from a stall report. Reported-by: "Paul E. McKenney" Signed-off-by: Frederic Weisbecker Reviewed-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_nocb.h | 3 +++ kernel/rcu/tree_stall.h | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index b473ff056f49..cb29b6bb0ed4 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1564,6 +1564,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return; + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, &rdp->nocb_entry_rdp, typeof(*rdp), diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 486c00536207..4fa64c959083 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -953,8 +953,7 @@ void show_rcu_gp_kthreads(void) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); cbs += data_race(READ_ONCE(rdp->n_cbs_invoked)); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) - show_rcu_nocb_state(rdp); + show_rcu_nocb_state(rdp); } pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); From 005b6187705bc9723518ce19c5cb911fc1f7ef07 Mon Sep 17 00:00:00 2001 From: Artem Sadovnikov Date: Sun, 29 Jun 2025 23:12:12 +0000 Subject: [PATCH 22/49] refscale: Check that nreaders and loops multiplication doesn't overflow The nreaders and loops variables are exposed as module parameters, which, in certain combinations, can lead to multiplication overflow. Besides, the loops parameter is defined as long but is used as int throughout the code, which can cause truncation on 64-bit kernels and possible zeroes where they shouldn't appear. Since the code uses the result of the multiplication as an int anyway, it only makes sense to redefine loops as int. An overflow check is also added, given that both operands can be very large. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
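As a minimal standalone sketch of the added guard (illustrative only; clamp_loops() is a hypothetical helper, and nreaders is assumed to be at least 1, as the preceding WARN_ONCE() clamps in ref_scale_init() enforce):

	#include <limits.h>

	/* Cap loops so that the later nreaders * loops product fits in an int. */
	static int clamp_loops(int nreaders, int loops)
	{
		/*
		 * Compare against INT_MAX / nreaders instead of computing the
		 * product nreaders * loops, which could itself overflow.
		 */
		if (loops > INT_MAX / nreaders)
			loops = INT_MAX / nreaders;
		return loops;
	}

The actual in-kernel check appears in the refscale.c hunk below.
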
Fixes: 653ed64b01dc ("refperf: Add a test to measure performance of read-side synchronization") Signed-off-by: Artem Sadovnikov Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/refscale.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f11a7c2af778..ab7fcdc94cc0 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -85,7 +85,7 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, // Number of typesafe_lookup structures, that is, the degree of concurrency. torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures."); // Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000, "Number of loops per experiment."); +torture_param(int, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. @@ -1140,7 +1140,7 @@ static void ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } @@ -1238,12 +1238,16 @@ ref_scale_init(void) // Reader tasks (default to ~75% of online CPUs). if (nreaders < 0) nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); - if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops)) + if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops)) loops = 1; if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders)) nreaders = 1; if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns)) nruns = 1; + if (WARN_ONCE(loops > INT_MAX / nreaders, + "%s: nreaders * loops will overflow, adjusted loops to %d", + __func__, INT_MAX / nreaders)) + loops = INT_MAX / nreaders; reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { From 4b9432ed65cb3a692e8d8bd86abb93f5f2dc5b69 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 15:43:01 +0200 Subject: [PATCH 23/49] rcu/exp: Remove confusing needless full barrier on task unblock A full memory barrier in the RCU-PREEMPT task unblock path claims to order the context switch (or rather the accesses prior to rcu_read_unlock()) with the expedited grace period fastpath. However the grace period cannot complete without a call into rcu_report_exp_rnp() with the node locked. This reports the quiescent state in a fully ordered fashion against the updater's accesses thanks to: 1) The READ-SIDE smp_mb__after_unlock_lock() barrier across nodes locking while propagating QS up to the root. 2) The UPDATE-SIDE smp_mb__after_unlock_lock() barrier while holding the root rnp to wait/check for the GP completion. 3) The (perhaps redundant given steps 1) and 2)) smp_mb() in rcu_seq_end() before the grace period completes. This makes the explicit barrier in this place superfluous. Therefore remove it as it is confusing. Acked-by: Paul E. 
McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_plugin.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0b0f56f6abc8..0532a13cb75e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -534,7 +534,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_exp_done(rnp); - smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; From bf0a57445d3fddfb47046aeccf4f5cb938a4e996 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 15:43:02 +0200 Subject: [PATCH 24/49] rcu/exp: Remove needless CPU up quiescent state report A CPU coming online checks for an ongoing grace period and reports a quiescent state accordingly if needed. This special treatment, which shortcuts the expedited IPI, originated as an optimization in the following commit: 338b0f760e84 ("rcu: Better hotplug handling for synchronize_sched_expedited()") The point is to avoid an IPI while waiting for a CPU to become online or failing to become offline. However this is pointless and even error-prone for several reasons: * If the CPU has been seen offline in the first round scanning offline and idle CPUs, no IPI is even tried and the quiescent state is reported on behalf of the CPU. * This means that if the IPI fails, the CPU just became offline. So it's unlikely to become online right away, unless the CPU-hotplug operation failed and rolled back, which is a rare event that can wait a jiffy for a new IPI to be issued. * But then the "optimization" for a failed CPU-hotplug-down operation only applies to !PREEMPT_RCU. * This forcibly reports a quiescent state even if ->cpu_no_qs.b.exp is not set. As a result it can race with remote QS reports on the same rdp. Fortunately it happens to be OK, but an accident is waiting to happen. For all those reasons, remove this optimization, which doesn't look worth keeping around. Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 2 -- kernel/rcu/tree_exp.h | 45 ++----------------------------------------- 2 files changed, 2 insertions(+), 45 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14d4499c6fc3..0bda23fec690 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -160,7 +160,6 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); -static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); static bool rcu_rdp_cpu_online(struct rcu_data *rdp); @@ -4268,7 +4267,6 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ - sync_sched_exp_online_cleanup(cpu); // Stop-machine done, so allow nohz_full to disable tick. 
tick_dep_clear(TICK_DEP_BIT_RCU); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 2fa7aa9155bd..6058a734090c 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -751,12 +751,8 @@ static void rcu_exp_handler(void *unused) struct task_struct *t = current; /* - * First, is there no need for a quiescent state from this CPU, - * or is this CPU already looking for a quiescent state for the - * current grace period? If either is the case, just leave. - * However, this should not happen due to the preemptible - * sync_sched_exp_online_cleanup() implementation being a no-op, - * so warn if this does happen. + * WARN if the CPU is unexpectedly already looking for a + * QS or has already reported one. */ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || @@ -803,11 +799,6 @@ static void rcu_exp_handler(void *unused) WARN_ON_ONCE(1); } -/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ -} - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each that is blocking the current @@ -885,38 +876,6 @@ static void rcu_exp_handler(void *unused) rcu_exp_need_qs(); } -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - unsigned long flags; - int my_cpu; - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - my_cpu = get_cpu(); - /* Quiescent state either not needed or already requested, leave. */ - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - READ_ONCE(rdp->cpu_no_qs.b.exp)) { - put_cpu(); - return; - } - /* Quiescent state needed on current CPU, so set it up locally. */ - if (my_cpu == cpu) { - local_irq_save(flags); - rcu_exp_need_qs(); - local_irq_restore(flags); - put_cpu(); - return; - } - /* Quiescent state needed on some other CPU, send IPI. */ - ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); - put_cpu(); - WARN_ON_ONCE(ret); -} - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections that are From fc39760cd0f432cd399a209c43dcb887fffdb6dc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 15:43:03 +0200 Subject: [PATCH 25/49] rcu/exp: Warn on QS requested on dying CPU It is not possible to send an IPI to a dying CPU that has passed the CPUHP_TEARDOWN_CPU stage. Remaining unhandled IPIs are handled later at CPUHP_AP_SMPCFD_DYING stage by stop machine. This is the last opportunity for the RCU exp handler to request an expedited quiescent state. And the upcoming final context switch between stop machine and idle must have reported the requested quiescent state. Therefore, it should not be possible to observe a pending requested expedited quiescent state when RCU finally stops watching the outgoing CPU. Once IPIs aren't possible anymore, the QS for the target CPU will be reported on its behalf by the RCU exp kworker. Provide an assertion to verify those expectations. Reviewed-by: Paul E. 
McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0bda23fec690..00c182b3f978 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4356,6 +4356,12 @@ void rcutree_report_cpu_dead(void) * may introduce a new READ-side while it is actually off the QS masks. */ lockdep_assert_irqs_disabled(); + /* + * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution. + * The requested QS must have been reported on the last context switch + * from stop machine to idle. + */ + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); // Do any dangling deferred wakeups. do_nocb_deferred_wakeup(rdp); From 1bba3900ca18bdae28d1b9fa10f16a8f8cb2ada1 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 7 May 2025 19:26:05 +0800 Subject: [PATCH 26/49] rcu/nocb: Fix possible invalid rdp's->nocb_cb_kthread pointer access During the CPU-online preparation stage, the corresponding rdp's ->nocb_cb_kthread is created if it does not already exist. If creation of that rcuop kthread fails, this CPU's rdp is de-offloaded and its ->nocb_cb_kthread pointer is never assigned, while this rdp's ->nocb_gp_rdp and the rdp_gp's ->nocb_gp_kthread remain valid. A subsequent re-offload operation on this offline CPU then passes the conditional check, and kthread_unpark() accesses the invalid rdp's ->nocb_cb_kthread pointer. This commit therefore uses the rdp's ->nocb_gp_kthread instead of the rdp_gp's ->nocb_gp_kthread for the safety check. Signed-off-by: Zqiang Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_nocb.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index cb29b6bb0ed4..08eb9b0e2fab 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1146,7 +1146,6 @@ static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) static int rcu_nocb_rdp_offload(struct rcu_data *rdp) { int wake_gp; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; WARN_ON_ONCE(cpu_online(rdp->cpu)); /* @@ -1156,7 +1155,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) if (!rdp->nocb_gp_rdp) return -EINVAL; - if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + if (WARN_ON_ONCE(!rdp->nocb_gp_kthread)) return -EINVAL; pr_info("Offloading %d\n", rdp->cpu); @@ -1166,7 +1165,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); + wake_up_process(rdp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, rcu_nocb_rdp_offload_wait_cond(rdp)); From 90c09d57caeca94e6f3f87c49e96a91edd40cbfd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Apr 2025 16:49:53 -0700 Subject: [PATCH 27/49] rcu: Protect ->defer_qs_iw_pending from data race On kernels built with CONFIG_IRQ_WORK=y, when rcu_read_unlock() is invoked within an interrupts-disabled region of code [1], it will invoke rcu_read_unlock_special(), which uses an irq-work handler to force the system to notice when the RCU read-side critical section actually ends. That end won't happen until interrupts are enabled at the soonest. In some kernels, such as those booted with rcutree.use_softirq=y, the irq-work handler is used unconditionally. 
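For illustration, here is a minimal sketch (not taken from the patch) of the kind of reader that forces this irq-work path:

	unsigned long flags;

	local_irq_save(flags);
	rcu_read_lock();
	/* ... access RCU-protected data while RCU needs a QS from this CPU ... */
	rcu_read_unlock();	/* rcu_read_unlock_special() can only defer the QS report. */
	local_irq_restore(flags);	/* The queued irq-work IPI fires no sooner than this. */
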
The per-CPU rcu_data structure's ->defer_qs_iw_pending field is updated by the irq-work handler and is both read and updated by rcu_read_unlock_special(). This resulted in the following KCSAN splat: ------------------------------------------------------------------------ BUG: KCSAN: data-race in rcu_preempt_deferred_qs_handler / rcu_read_unlock_special read to 0xffff96b95f42d8d8 of 1 bytes by task 90 on cpu 8: rcu_read_unlock_special+0x175/0x260 __rcu_read_unlock+0x92/0xa0 rt_spin_unlock+0x9b/0xc0 __local_bh_enable+0x10d/0x170 __local_bh_enable_ip+0xfb/0x150 rcu_do_batch+0x595/0xc40 rcu_cpu_kthread+0x4e9/0x830 smpboot_thread_fn+0x24d/0x3b0 kthread+0x3bd/0x410 ret_from_fork+0x35/0x40 ret_from_fork_asm+0x1a/0x30 write to 0xffff96b95f42d8d8 of 1 bytes by task 88 on cpu 8: rcu_preempt_deferred_qs_handler+0x1e/0x30 irq_work_single+0xaf/0x160 run_irq_workd+0x91/0xc0 smpboot_thread_fn+0x24d/0x3b0 kthread+0x3bd/0x410 ret_from_fork+0x35/0x40 ret_from_fork_asm+0x1a/0x30 no locks held by irq_work/8/88. irq event stamp: 200272 hardirqs last enabled at (200272): [] finish_task_switch+0x131/0x320 hardirqs last disabled at (200271): [] __schedule+0x129/0xd70 softirqs last enabled at (0): [] copy_process+0x4df/0x1cc0 softirqs last disabled at (0): [<0000000000000000>] 0x0 ------------------------------------------------------------------------ The problem is that irq-work handlers run with interrupts enabled, which means that rcu_preempt_deferred_qs_handler() could be interrupted, and that interrupt handler might contain an RCU read-side critical section, which might invoke rcu_read_unlock_special(). In the strict KCSAN mode of operation used by RCU, this constitutes a data race on the ->defer_qs_iw_pending field. This commit therefore disables interrupts across the portion of rcu_preempt_deferred_qs_handler() that updates the ->defer_qs_iw_pending field. This suffices because this handler is not a fast path. Signed-off-by: Paul E. McKenney Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_plugin.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0b0f56f6abc8..a91b2322a0cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -624,10 +624,13 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t) */ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) { + unsigned long flags; struct rcu_data *rdp; rdp = container_of(iwp, struct rcu_data, defer_qs_iw); + local_irq_save(flags); rdp->defer_qs_iw_pending = false; + local_irq_restore(flags); } /* From 78370df5c3574cdc5c6de7844481b4dc0ef4f172 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 2 Jul 2025 16:59:36 +0200 Subject: [PATCH 28/49] rcu: Enable rcu_normal_wake_from_gp on small systems Automatically enable the rcu_normal_wake_from_gp parameter on systems with a small number of CPUs. The activation threshold is set to 16 CPUs. This helps to reduce the latency of the normal synchronize_rcu() API by waking up GP-waiters earlier, decoupling synchronize_rcu() callers from regular callback handling. A benchmark running 64 parallel jobs (on a system with 64 CPUs) invoking synchronize_rcu() demonstrates a notable latency reduction with the setting enabled. 
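For reference, the knob can also be flipped at runtime, as quoted in the documentation patch later in this series:

	echo 1 > /sys/module/rcutree/parameters/rcu_normal_wake_from_gp
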
Latency distribution in microseconds (first with the parameter disabled, then enabled): 0 - 9999 : 1 10000 - 19999 : 4 20000 - 29999 : 399 30000 - 39999 : 3197 40000 - 49999 : 10428 50000 - 59999 : 17363 60000 - 69999 : 15529 70000 - 79999 : 9287 80000 - 89999 : 4249 90000 - 99999 : 1915 100000 - 109999 : 922 110000 - 119999 : 390 120000 - 129999 : 187 ... 0 - 9999 : 1 10000 - 19999 : 234 20000 - 29999 : 6678 30000 - 39999 : 33463 40000 - 49999 : 20669 50000 - 59999 : 2766 60000 - 69999 : 183 ... Reviewed-by: Joel Fernandes Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f83bbb408895..8c22db759978 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1632,8 +1632,10 @@ static void rcu_sr_put_wait_head(struct llist_node *node) atomic_set_release(&sr_wn->inuse, 0); } -/* Disabled by default. */ -static int rcu_normal_wake_from_gp; +/* Enable rcu_normal_wake_from_gp automatically on small systems. */ +#define WAKE_FROM_GP_CPU_THRESHOLD 16 + +static int rcu_normal_wake_from_gp = -1; module_param(rcu_normal_wake_from_gp, int, 0644); static struct workqueue_struct *sync_wq; @@ -3250,7 +3252,7 @@ static void synchronize_rcu_normal(void) trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request")); - if (!READ_ONCE(rcu_normal_wake_from_gp)) { + if (READ_ONCE(rcu_normal_wake_from_gp) < 1) { wait_rcu_gp(call_rcu_hurry); goto trace_complete_out; } @@ -4854,6 +4856,12 @@ void __init rcu_init(void) sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0); WARN_ON(!sync_wq); + /* Respect if explicitly disabled via a boot parameter. */ + if (rcu_normal_wake_from_gp < 0) { + if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD) + rcu_normal_wake_from_gp = 1; + } + /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ if (qovld < 0) From d827673d8a4e69937dd3731da2686a2d8206aef5 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 2 Jul 2025 16:59:37 +0200 Subject: [PATCH 29/49] Documentation/kernel-parameters: Update rcu_normal_wake_from_gp doc Update the documentation about the rcu_normal_wake_from_gp parameter. Reviewed-by: Joel Fernandes Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay (AMD) --- Documentation/admin-guide/kernel-parameters.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1f2c0874da9..f7e4bee2b823 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5485,7 +5485,8 @@ echo 1 > /sys/module/rcutree/parameters/rcu_normal_wake_from_gp or pass a boot parameter "rcutree.rcu_normal_wake_from_gp=1" - Default is 0. + Default is 1 if num_possible_cpus() <= 16 and it is not explicitly + disabled by the boot parameter passing 0. rcuscale.gp_async= [KNL] Measure performance of asynchronous From b41642c87716bbd09797b1e4ea7d904f06c39b7b Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 8 Jul 2025 10:22:19 -0400 Subject: [PATCH 30/49] rcu: Fix rcu_read_unlock() deadloop due to IRQ work If rcu_read_unlock_special() is invoked during irq_exit(), the system can lock up when an IPI is issued. This is because the IPI itself triggers the irq_exit() path, causing a recursive lockup. 
This is precisely what Xiongfeng found when invoking a BPF program on the trace_tick_stop() tracepoint, as shown in the trace below. Fix by managing the irq_work state correctly. irq_exit() __irq_exit_rcu() /* in_hardirq() returns false after this */ preempt_count_sub(HARDIRQ_OFFSET) tick_irq_exit() tick_nohz_irq_exit() tick_nohz_stop_sched_tick() trace_tick_stop() /* a bpf prog is hooked on this trace point */ __bpf_trace_tick_stop() bpf_trace_run2() rcu_read_unlock_special() /* will send an IPI to itself */ irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); A simple reproducer can also be obtained by doing the following in tick_irq_exit(). It will hang on boot without the patch: static inline void tick_irq_exit(void) { + rcu_read_lock(); + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); + rcu_read_unlock(); + Reported-by: Xiongfeng Wang Closes: https://lore.kernel.org/all/9acd5f9f-6732-7701-6880-4b51190aa070@huawei.com/ Tested-by: Qi Xi Signed-off-by: Joel Fernandes Reviewed-by: "Paul E. McKenney" Reported-by: Linux Kernel Functional Testing [neeraj: Apply Frederic's suggested fix for PREEMPT_RT] Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.h | 13 ++++++++++++- kernel/rcu/tree_plugin.h | 37 ++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3830c19cf2f6..de6ca13a7b5f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -174,6 +174,17 @@ struct rcu_snap_record { unsigned long jiffies; /* Track jiffies value */ }; +/* + * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention + * to report quiescent states at the soonest possible time. + * The request can be in one of the following states: + * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled. + * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it + * ran and we still haven't reported a quiescent state. + */ +#define DEFER_QS_IDLE 0 +#define DEFER_QS_PENDING 1 + /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -192,7 +203,7 @@ struct rcu_data { /* during and after the last grace */ /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ - bool defer_qs_iw_pending; /* Scheduler attention pending? */ + int defer_qs_iw_pending; /* Scheduler attention pending? */ struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a91b2322a0cd..b6f44871f774 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) struct rcu_node *rnp; union rcu_special special; + rdp = this_cpu_ptr(&rcu_data); + if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING) + rdp->defer_qs_iw_pending = DEFER_QS_IDLE; + /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. 
*/ special = t->rcu_read_unlock_special; - rdp = this_cpu_ptr(&rcu_data); if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; @@ -629,7 +632,23 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) rdp = container_of(iwp, struct rcu_data, defer_qs_iw); local_irq_save(flags); - rdp->defer_qs_iw_pending = false; + + /* + * If the IRQ work handler happens to run in the middle of RCU read-side + * critical section, it could be ineffective in getting the scheduler's + * attention to report a deferred quiescent state (the whole point of the + * IRQ work). For this reason, requeue the IRQ work. + * + * Basically, we want to avoid following situation: + * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING) + * 2. CPU enters new rcu_read_lock() + * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0 + * 4. rcu_read_unlock() does not re-queue work (state still PENDING) + * 5. Deferred QS reporting does not happen. + */ + if (rcu_preempt_depth() > 0) + WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE); + local_irq_restore(flags); } @@ -676,17 +695,13 @@ static void rcu_read_unlock_special(struct task_struct *t) set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { + expboost && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && + cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - IS_ENABLED(CONFIG_PREEMPT_RT)) - rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( - rcu_preempt_deferred_qs_handler); - else - init_irq_work(&rdp->defer_qs_iw, - rcu_preempt_deferred_qs_handler); - rdp->defer_qs_iw_pending = true; + rdp->defer_qs_iw = + IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); + rdp->defer_qs_iw_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } From a883f273431804adfac6048f5e71a041f5626f64 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 May 2025 15:30:01 -0700 Subject: [PATCH 31/49] torture: Provide EXPERT Kconfig option for arm64 KCSAN torture.sh runs The arm64 architecture requires that KCSAN-enabled kernels be built with the CONFIG_EXPERT=y Kconfig option. This commit therefore causes the torture.sh script to provide this option, but only for --kcsan runs on arm64 systems. Signed-off-by: Paul E. McKenney Cc: Marco Elver Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Will Deacon Cc: Cc: Acked-by: Will Deacon Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/torture.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 25847042e30e..420c551b824b 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -313,6 +313,13 @@ then do_scftorture=no fi +# CONFIG_EXPERT=y is currently required for arm64 KCSAN runs. 
+kcsan_expert= +if test "${thisarch}" = aarch64 +then + kcsan_expert="CONFIG_EXPERT=y" +fi + touch $T/failures touch $T/successes @@ -392,7 +399,7 @@ function torture_set { then chk_rdr_state="CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE=y" fi - torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y ${chk_rdr_state}" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan + torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y ${kcsan_expert} ${chk_rdr_state}" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan mv $T/last-resdir $T/last-resdir-kcsan || : fi } From ce243b71cfef7e44401c8b2ca93cdc206f8393d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 May 2025 16:12:00 -0700 Subject: [PATCH 32/49] torture: Suppress "find" diagnostics from torture.sh --do-none run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When torture.sh is told to do nothing, it produces a couple of distracting diagnostics from the "find" command: find: ‘’: No such file or directory find: ‘’: No such file or directory This is pointless chatter and could cause confusion. This commit therefore suppresses these diagnostics when there is nothing to find. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/torture.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 420c551b824b..ed59bd43d4f8 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -719,8 +719,11 @@ fi echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log echo Summary: Successes: $nsuccesses Failures: $nfailures. | tee -a $T/log tdir="`cat $T/successes $T/failures | head -1 | awk '{ print $NF }' | sed -e 's,/[^/]\+/*$,,'`" -find "$tdir" -name 'ConfigFragment.diags' -print > $T/configerrors -find "$tdir" -name 'Make.out.diags' -print > $T/builderrors +if test -n "$tdir" +then + find "$tdir" -name 'ConfigFragment.diags' -print > $T/configerrors + find "$tdir" -name 'Make.out.diags' -print > $T/builderrors +fi if test -s "$T/configerrors" then echo " Scenarios with .config errors: `wc -l "$T/configerrors" | awk '{ print $1 }'`" From 0783f216423fff1acafae3b464b95e05ac596e12 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Jun 2025 18:03:53 -0700 Subject: [PATCH 33/49] torture: Extract testid.txt generation to separate script The kvm.sh script places a testid.txt file in the top-level results directory in order to identify the tree and commit that was tested. This works well, but there are scripts other than kvm.sh that also create results directories, and it would be good for them to also identify exactly what was tested. This commit therefore extracts the testid.txt generation to a new mktestid.sh script so that it can be easily used elsewhere. Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/kvm.sh | 13 +--------- .../selftests/rcutorture/bin/mktestid.sh | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 12 deletions(-) create mode 100755 tools/testing/selftests/rcutorture/bin/mktestid.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 9c1b850b3227..617cba339d28 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -442,18 +442,7 @@ echo $scriptname $args touch $resdir/$ds/log echo $scriptname $args >> $resdir/$ds/log echo ${TORTURE_SUITE} > $resdir/$ds/torture_suite -echo Build directory: `pwd` > $resdir/$ds/testid.txt -if test -d .git -then - echo Current commit: `git rev-parse HEAD` >> $resdir/$ds/testid.txt - echo >> $resdir/$ds/testid.txt - echo ' ---' Output of "'"git status"'": >> $resdir/$ds/testid.txt - git status >> $resdir/$ds/testid.txt - echo >> $resdir/$ds/testid.txt - echo >> $resdir/$ds/testid.txt - echo ' ---' Output of "'"git diff HEAD"'": >> $resdir/$ds/testid.txt - git diff HEAD >> $resdir/$ds/testid.txt -fi +mktestid.sh $resdir/$ds ___EOF___ kvm-assign-cpus.sh /sys/devices/system/node > $T/cpuarray.awk kvm-get-cpus-script.sh $T/cpuarray.awk $T/dumpbatches.awk diff --git a/tools/testing/selftests/rcutorture/bin/mktestid.sh b/tools/testing/selftests/rcutorture/bin/mktestid.sh new file mode 100755 index 000000000000..16f9907a4dae --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/mktestid.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Create a testid.txt file in the specified directory. +# +# Usage: mktestid.sh dirpath +# +# Copyright (C) Meta Platforms, Inc. 2025 +# +# Author: Paul E. McKenney + +resdir="$1" +if test -z "${resdir}" || ! test -d "${resdir}" || ! test -w "${resdir}" +then + echo Path '"'${resdir}'"' not writeable directory, no ${resdir}/testid.txt. + exit 1 +fi +echo Build directory: `pwd` > ${resdir}/testid.txt +if test -d .git +then + echo Current commit: `git rev-parse HEAD` >> ${resdir}/testid.txt + echo >> ${resdir}/testid.txt + echo ' ---' Output of "'"git status"'": >> ${resdir}/testid.txt + git status >> ${resdir}/testid.txt + echo >> ${resdir}/testid.txt + echo >> ${resdir}/testid.txt + echo ' ---' Output of "'"git diff HEAD"'": >> ${resdir}/testid.txt + git diff HEAD >> ${resdir}/testid.txt +fi From d57300010d38a8eb518fa05d305bef1343716431 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Jun 2025 19:35:13 -0700 Subject: [PATCH 34/49] torture: Add testid.txt file to --do-allmodconfig and --do-rcu-rust runs This commit causes the torture.sh --do-allmodconfig and --do-rcu-rust parameters to add testid.txt files to their results directories, thus allowing easier analysis of the results of a series of runs kicked off by "git bisect". Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index ed59bd43d4f8..a7a8e801283d 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -410,6 +410,7 @@ then echo " --- allmodconfig:" Start `date` | tee -a $T/log amcdir="tools/testing/selftests/rcutorture/res/$ds/allmodconfig" mkdir -p "$amcdir" + mktestid.sh "$amcdir" echo " --- make clean" | tee $amcdir/log > "$amcdir/Make.out" 2>&1 make -j$MAKE_ALLOTED_CPUS clean >> "$amcdir/Make.out" 2>&1 retcode=$? @@ -516,6 +517,7 @@ then echo " --- do-rcu-rust:" Start `date` | tee -a $T/log rrdir="tools/testing/selftests/rcutorture/res/$ds/results-rcu-rust" mkdir -p "$rrdir" + mktestid.sh "$rrdir" echo " --- make LLVM=1 rustavailable " | tee -a $rrdir/log > $rrdir/rustavailable.out make LLVM=1 rustavailable > $T/rustavailable.out 2>&1 retcode=$? From 3aee453496026451fe126e53d43db6d687b51209 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Jun 2025 07:19:34 -0700 Subject: [PATCH 35/49] torture: Make torture.sh tolerate runs having bad kvm.sh arguments Currently, torture.sh assumes excessive levels of reviewer competence and thus fails to gracefully handle cases where it is tricked into giving kvm.sh invalid arguments. This commit therefore upgrades error handling to more gracefully handle this situation. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- .../selftests/rcutorture/bin/torture.sh | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index a7a8e801283d..39844d213da5 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -378,13 +378,19 @@ function torture_set { then curflavor=$flavor torture_one "$@" - mv $T/last-resdir $T/last-resdir-nodebug || : + if test -e $T/last-resdir + then + mv $T/last-resdir $T/last-resdir-nodebug || : + fi fi if test "$do_kasan" = "yes" then curflavor=${flavor}-kasan torture_one "$@" --kasan - mv $T/last-resdir $T/last-resdir-kasan || : + if test -e $T/last-resdir + then + mv $T/last-resdir $T/last-resdir-kasan || : + fi fi if test "$do_kcsan" = "yes" then @@ -400,7 +406,10 @@ function torture_set { chk_rdr_state="CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE=y" fi torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y ${kcsan_expert} ${chk_rdr_state}" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan - mv $T/last-resdir $T/last-resdir-kcsan || : + if test -e $T/last-resdir + then + mv $T/last-resdir $T/last-resdir-kcsan || : + fi fi } @@ -704,7 +713,14 @@ nfailures=0 echo FAILURES: | tee -a $T/log if test -s "$T/failures" then - awk < "$T/failures" -v sq="'" '{ print "echo " sq $0 sq; print "sed -e " sq "1,/^ --- .* Test summary:$/d" sq " " $2 "/log | grep Summary: | sed -e " sq "s/^[^S]*/ /" sq; }' | sh | tee -a $T/log | tee "$T/failuresum" + awk < "$T/failures" -v sq="'" ' + { + print "echo " sq $0 sq; + if ($2 != "") + print "sed -e " sq "1,/^ --- .* Test summary:$/d" sq " " $2 "/log | grep Summary: | sed -e " sq "s/^[^S]*/ /" sq; + else + print "echo " sq " " sq "Run failed to produce results directory."; + }' | sh | tee -a $T/log | tee "$T/failuresum" nfailures="`wc -l "$T/failures" | awk '{ 
print $1 }'`" grep "^ Summary: " "$T/failuresum" | grep -v '^ Summary: Bugs: [0-9]* (all bugs kcsan)$' > "$T/nonkcsan" @@ -714,13 +730,13 @@ then fi ret=2 fi -if test "$do_kcsan" = "yes" +if test "$do_kcsan" = "yes" && test -e tools/testing/selftests/rcutorture/res/$ds then TORTURE_KCONFIG_KCSAN_ARG=1 tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh tools/testing/selftests/rcutorture/res/$ds > tools/testing/selftests/rcutorture/res/$ds/kcsan.sum fi echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log echo Summary: Successes: $nsuccesses Failures: $nfailures. | tee -a $T/log -tdir="`cat $T/successes $T/failures | head -1 | awk '{ print $NF }' | sed -e 's,/[^/]\+/*$,,'`" +tdir="`cat $T/successes $T/failures | awk 'NF > 1 { print $NF }' | head -1 | sed -e 's,/[^/]\+/*$,,'`" if test -n "$tdir" then find "$tdir" -name 'ConfigFragment.diags' -print > $T/configerrors From 17f4698a9e6092dd59e5dd616b21095ba9c8b6fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Jun 2025 14:36:04 -0700 Subject: [PATCH 36/49] torture: Add "ERROR" diagnostic for testing kernel-build output Some recent kernel-build failures have featured "ERROR", so this commit adds it to the list checked by kvm-build.sh. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/kvm-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index 11f8d232b0ee..3edfd064ef81 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -44,7 +44,7 @@ fi ncpus="`getconf _NPROCESSORS_ONLN`" make -j$((2 * ncpus)) $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1 retval=$? -if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | grep -E -q "Stop|Error|error:|warning:" || grep -E -q "Stop|Error|error:" < $resdir/Make.out +if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | grep -E -q "Stop|ERROR|Error|error:|warning:" || grep -E -q "Stop|ERROR|Error|error:" < $resdir/Make.out then echo Kernel build error grep -E "Stop|Error|error:|warning:" < $resdir/Make.out From 748d7923b53ff7bcb3d825ef765d8461d7af1794 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Jun 2025 03:47:11 -0700 Subject: [PATCH 37/49] torture: Make torture.sh --allmodconfig testing fail on warnings Currently, the torture.sh --allmodconfig testing looks solely at the exit code from the kernel build, and thus fails to flag many compiler warnings. This commit therefore checks the kernel-build output for compiler diagnostics. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/bin/torture.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 39844d213da5..611bc03a8dc7 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -438,6 +438,10 @@ then make -j$MAKE_ALLOTED_CPUS >> "$amcdir/Make.out" 2>&1 retcode="$?" echo $retcode > "$amcdir/Make.exitcode" + if grep -E -q "Stop|ERROR|Error|error:|warning:" < "$amcdir/Make.out" + then + retcode=99 + fi buildphase='"make"' fi if test "$retcode" -eq 0 From cbd5d35e6ddc47b4cde5c96a0d5f00da0f8e881f Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 1 Jul 2025 17:23:26 -0700 Subject: [PATCH 38/49] torture: Remove support for SRCU-lite Because SRCU-lite is being replaced by SRCU-fast, this commit removes support for SRCU-lite from refscale.c. Both SRCU-lite and SRCU-fast provide faster readers by dropping the smp_mb() call from their lock and unlock primitives, but incur a pair of added RCU grace periods during the SRCU grace period. There is a trivial mapping from the SRCU-lite API to that of SRCU-fast, so there should be no transition issues. [ paulmck: Apply Christoph Hellwig feedback. ] Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/refscale.c | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f11a7c2af778..f4b2cea1cce5 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -246,36 +246,6 @@ static const struct ref_scale_ops srcu_fast_ops = { .name = "srcu-fast" }; -static void srcu_lite_ref_scale_read_section(const int nloops) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock_lite(srcu_ctlp); - srcu_read_unlock_lite(srcu_ctlp, idx); - } -} - -static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock_lite(srcu_ctlp); - un_delay(udl, ndl); - srcu_read_unlock_lite(srcu_ctlp, idx); - } -} - -static const struct ref_scale_ops srcu_lite_ops = { - .init = rcu_sync_scale_init, - .readsection = srcu_lite_ref_scale_read_section, - .delaysection = srcu_lite_ref_scale_delay_section, - .name = "srcu-lite" -}; - #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -1193,7 +1163,7 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &rcu_ops, &srcu_ops, &srcu_fast_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, From d08d409126d7d5ad2d8ceac77fb97f2d892e9f85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2025 17:23:27 -0700 Subject: [PATCH 39/49] rcutorture: Remove SRCU-lite scenarios This commit prepares for the removal of SRCU-Lite by removing the SRCU-L rcutorture scenario that tests it. Both SRCU-lite and SRCU-fast provide faster readers by dropping the smp_mb() call from their lock and unlock primitives, but incur a pair of added RCU grace periods during the SRCU grace period. There is a trivial mapping from the SRCU-lite API to that of SRCU-fast, so there should be no transition issues. [ paulmck: Apply Christoph Hellwig feedback. ] Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay (AMD) --- tools/testing/selftests/rcutorture/configs/rcu/CFLIST | 1 - tools/testing/selftests/rcutorture/configs/rcu/SRCU-L | 10 ---------- .../selftests/rcutorture/configs/rcu/SRCU-L.boot | 3 --- 3 files changed, 14 deletions(-) delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-L delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index 45f572570a8c..98b6175e5aa0 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -5,7 +5,6 @@ TREE04 TREE05 TREE07 TREE09 -SRCU-L SRCU-N SRCU-P SRCU-T diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L deleted file mode 100644 index 3b4fa8dbef8a..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L +++ /dev/null @@ -1,10 +0,0 @@ -CONFIG_RCU_TRACE=n -CONFIG_SMP=y -CONFIG_NR_CPUS=6 -CONFIG_HOTPLUG_CPU=y -CONFIG_PREEMPT_NONE=y -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=n -#CHECK#CONFIG_RCU_EXPERT=n -CONFIG_KPROBES=n -CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot deleted file mode 100644 index 0207b3138c5b..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-L.boot +++ /dev/null @@ -1,3 +0,0 @@ -rcutorture.torture_type=srcu -rcutorture.reader_flavor=0x4 -rcutorture.fwd_progress=3 From 941ab0b369c983f7867de54c8579fd7f1676ee3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2025 17:23:28 -0700 Subject: [PATCH 40/49] rcutorture: Remove support for SRCU-lite Because SRCU-lite is being replaced by SRCU-fast, this commit removes support for SRCU-lite from rcutorture.c Both SRCU-lite and SRCU-fast provide faster readers by dropping the smp_mb() call from their lock and unlock primitives, but incur a pair of added RCU grace periods during the SRCU grace period. There is a trivial mapping from the SRCU-lite API to that of SRCU-fast, so there should be no transition issues. [ paulmck: Apply Christoph Hellwig feedback. ] Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay (AMD) --- include/linux/srcu.h | 2 +- kernel/rcu/rcutorture.c | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 900b0d5c05f5..c20dacb563e5 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -49,7 +49,7 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). #define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ - SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // All of the above. + SRCU_READ_FLAVOR_FAST) // All of the above. #define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // Flavors requiring synchronize_rcu() // instead of smp_mb(). 
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 213f23f20a64..7a893d51d02b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -718,11 +718,6 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 1; } - if (reader_flavor & SRCU_READ_FLAVOR_LITE) { - idx = srcu_read_lock_lite(srcu_ctlp); - WARN_ON_ONCE(idx & ~0x1); - ret += idx << 2; - } if (reader_flavor & SRCU_READ_FLAVOR_FAST) { scp = srcu_read_lock_fast(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); @@ -756,8 +751,6 @@ static void srcu_torture_read_unlock(int idx) WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); if (reader_flavor & SRCU_READ_FLAVOR_FAST) srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); - if (reader_flavor & SRCU_READ_FLAVOR_LITE) - srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) From 3aea745a2a82e8397d2f30326f0dcf5f375dd7c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Jun 2025 13:49:53 -0700 Subject: [PATCH 41/49] srcu: Expedite SRCU-fast grace periods Currently, SRCU-fast grace periods use synchronize_rcu() to provide the needed ordering with readers, even given an expedited SRCU-fast grace period, which isn't all that expedited. This commit therefore instead uses synchronize_rcu_expedited() if there is an expedited SRCU-fast grace period in flight. Of course, given an non-expedited SRCU-fast grace period blocked in synchronize_rcu(), a later request for an expedited SRCU-fast grace period will wait for that synchronize_rcu() to return before switching to use of synchronize_rcu_expedited(). If this turns out to be a real problem for a production workload, we can increase the complexity (but likely also degrade the energy efficiency) to speed things up further. Signed-off-by: Paul E. McKenney Cc: Andrii Nakryiko Cc: Alexei Starovoitov Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 48047260697e..c5e8ebc493d5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -502,6 +502,8 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) */ if (!did_gp) smp_mb(); /* A */ + else if (srcu_gp_is_expedited(ssp)) + synchronize_rcu_expedited(); /* X */ else synchronize_rcu(); /* X */ From 623baa01d5b43ca06ba337751d9a4f62199d1715 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2025 17:23:29 -0700 Subject: [PATCH 42/49] srcu: Remove SRCU-lite implementation This commit removes the SRCU-lite implementation, which has been replaced by SRCU-fast. Both SRCU-lite and SRCU-fast provide faster readers by dropping the smp_mb() call from their lock and unlock primitives, but incur a pair of added RCU grace periods during the SRCU grace period. There is a trivial mapping from the SRCU-lite API to that of SRCU-fast, so there should be no transition issues. [ paulmck: Apply Christoph Hellwig feedback. ] Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay (AMD) --- include/linux/srcu.h | 47 ++-------------------------------------- include/linux/srcutiny.h | 3 --- include/linux/srcutree.h | 38 -------------------------------- 3 files changed, 2 insertions(+), 86 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index c20dacb563e5..cf711a0f440b 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -46,11 +46,11 @@ int init_srcu_struct(struct srcu_struct *ssp); /* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */ #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). -#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). +// 0x4 // SRCU-lite is no longer with us. #define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ SRCU_READ_FLAVOR_FAST) // All of the above. -#define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) +#define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_FAST // Flavors requiring synchronize_rcu() // instead of smp_mb(). void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); @@ -299,33 +299,6 @@ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct * return __srcu_read_lock_fast(ssp); } -/** - * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. - * @ssp: srcu_struct in which to register the new reader. - * - * Enter an SRCU read-side critical section, but for a light-weight - * smp_mb()-free reader. See srcu_read_lock() for more information. - * - * If srcu_read_lock_lite() is ever used on an srcu_struct structure, - * then none of the other flavors may be used, whether before, during, - * or after. Note that grace-period auto-expediting is disabled for _lite - * srcu_struct structures because auto-expedited grace periods invoke - * synchronize_rcu_expedited(), IPIs and all. - * - * Note that srcu_read_lock_lite() can be invoked only from those contexts - * where RCU is watching, that is, from contexts where it would be legal - * to invoke rcu_read_lock(). Otherwise, lockdep will complain. - */ -static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) -{ - int retval; - - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_LITE); - retval = __srcu_read_lock_lite(ssp); - rcu_try_lock_acquire(&ssp->dep_map); - return retval; -} - /** * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -434,22 +407,6 @@ static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __ __srcu_read_unlock_fast(ssp, scp); } -/** - * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. - * @ssp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock_lite(). - * - * Exit a light-weight SRCU read-side critical section. - */ -static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) - __releases(ssp) -{ - WARN_ON_ONCE(idx & ~0x1); - srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); - srcu_lock_release(&ssp->dep_map); - __srcu_read_unlock_lite(ssp, idx); -} - /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. 
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 380260317d98..51ce25f07930 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -93,9 +93,6 @@ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_
 	__srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp));
 }
 
-#define __srcu_read_lock_lite __srcu_read_lock
-#define __srcu_read_unlock_lite __srcu_read_unlock
-
 static inline void synchronize_srcu_expedited(struct srcu_struct *ssp)
 {
 	synchronize_srcu(ssp);
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 8bed7e6cc4c1..bf44d8d1e69e 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -278,44 +278,6 @@ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast().");
 }
 
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct. Returns an index that must be passed to the matching
- * srcu_read_unlock_lite().
- *
- * Note that this_cpu_inc() is an RCU read-side critical section either
- * because it disables interrupts, because it is a single instruction,
- * or because it is a read-modify-write atomic operation, depending on
- * the whims of the architecture.
- */
-static inline int __srcu_read_lock_lite(struct srcu_struct *ssp)
-{
-	struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
-
-	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite().");
-	this_cpu_inc(scp->srcu_locks.counter); /* Y */
-	barrier(); /* Avoid leaking the critical section. */
-	return __srcu_ptr_to_ctr(ssp, scp);
-}
-
-/*
- * Removes the count for the old reader from the appropriate
- * per-CPU element of the srcu_struct. Note that this may well be a
- * different CPU than that which was incremented by the corresponding
- * srcu_read_lock_lite(), but it must be within the same task.
- *
- * Note that this_cpu_inc() is an RCU read-side critical section either
- * because it disables interrupts, because it is a single instruction,
- * or because it is a read-modify-write atomic operation, depending on
- * the whims of the architecture.
- */
-static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
-{
-	barrier(); /* Avoid leaking the critical section. */
-	this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); /* Z */
-	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite().");
-}
-
 void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
 
 // Record reader usage even for CONFIG_PROVE_RCU=n kernels. This is

From 2a73ebf267fe26fb1fb5c8f085be986dfe9faf83 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 1 Jul 2025 17:23:30 -0700
Subject: [PATCH 43/49] checkpatch: Remove SRCU-lite deprecation

Now that SRCU-lite has been removed from the kernel, let's remove the
now-redundant deprecation from checkpatch.pl.

Signed-off-by: "Paul E. McKenney"
Signed-off-by: Neeraj Upadhyay (AMD)
---
 scripts/checkpatch.pl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 664f7b7a622c..867c7b6fd839 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -857,8 +857,6 @@ our %deprecated_apis = (
 	"kunmap"			=> "kunmap_local",
 	"kmap_atomic"			=> "kmap_local_page",
 	"kunmap_atomic"			=> "kunmap_local",
-	"srcu_read_lock_lite"		=> "srcu_read_lock_fast",
-	"srcu_read_unlock_lite"		=> "srcu_read_unlock_fast",
 );
 
 #Create a search pattern for all these strings to speed up a loop below

From 908a97eba8c8b510996bf5d77d1e3070d59caa6d Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Tue, 15 Jul 2025 16:01:52 -0400
Subject: [PATCH 44/49] rcu: Refactor expedited handling check in
 rcu_read_unlock_special()

Extract the complex expedited handling condition in
rcu_read_unlock_special() into a separate function
rcu_unlock_needs_exp_handling() with detailed comments explaining each
condition. This improves code readability. No functional change
intended.

Reviewed-by: "Paul E. McKenney"
Signed-off-by: Joel Fernandes
Signed-off-by: Neeraj Upadhyay (AMD)
---
 kernel/rcu/tree_plugin.h | 83 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 74 insertions(+), 9 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index b6f44871f774..c71d9d10780d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -652,6 +652,75 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
 	local_irq_restore(flags);
 }
 
+/*
+ * Check if expedited grace period processing during unlock is needed.
+ *
+ * This function determines whether expedited handling is required based on:
+ * 1. Task blocking an expedited grace period (based on a heuristic that
+ *    can yield false positives; see below.)
+ * 2. CPU participating in an expedited grace period
+ * 3. Strict grace period mode requiring expedited handling
+ * 4. RCU priority deboosting needs when interrupts were disabled
+ *
+ * @t: The task being checked
+ * @rdp: The per-CPU RCU data
+ * @rnp: The RCU node for this CPU
+ * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock()
+ *
+ * Returns true if expedited processing of the rcu_read_unlock() is needed.
+ */
+static bool rcu_unlock_needs_exp_handling(struct task_struct *t,
+					  struct rcu_data *rdp,
+					  struct rcu_node *rnp,
+					  bool irqs_were_disabled)
+{
+	/*
+	 * Check if this task is blocking an expedited grace period. If the
+	 * task was preempted within an RCU read-side critical section and is
+	 * on the expedited grace period blockers list (exp_tasks), we need
+	 * expedited handling to unblock the expedited GP. This is not an exact
+	 * check because 't' might not be on the exp_tasks list at all - it's
+	 * just a fast heuristic that can sometimes be a false positive.
+	 */
+	if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks))
+		return true;
+
+	/*
+	 * Check if this CPU is participating in an expedited grace period.
+	 * The expmask bitmap tracks which CPUs need to check in for the
+	 * current expedited GP. If our CPU's bit is set, we need expedited
+	 * handling to help complete the expedited GP.
+	 */
+	if (rdp->grpmask & READ_ONCE(rnp->expmask))
+		return true;
+
+	/*
+	 * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods
+	 * are treated as short for testing purposes even if that means
+	 * disturbing the system more. Check if either:
+	 * - This CPU has not yet reported a quiescent state, or
+	 * - This task was preempted within an RCU critical section
+	 * In either case, require expedited handling for strict GP mode.
+	 */
+	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+	    ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node))
+		return true;
+
+	/*
+	 * RCU priority boosting case: If a task is subject to RCU priority
+	 * boosting and exits an RCU read-side critical section with interrupts
+	 * disabled, we need expedited handling to ensure timely deboosting.
+	 * Without this, a low-priority task could incorrectly run at high
+	 * real-time priority for an extended period, degrading real-time
+	 * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels,
+	 * not just to PREEMPT_RT.
+	 */
+	if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node)
+		return true;
+
+	return false;
+}
+
 /*
  * Handle special cases during rcu_read_unlock(), such as needing to
  * notify RCU core processing or task having blocked during the RCU
@@ -671,18 +740,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
 	local_irq_save(flags);
 	irqs_were_disabled = irqs_disabled_flags(flags);
 	if (preempt_bh_were_disabled || irqs_were_disabled) {
-		bool expboost;	// Expedited GP in flight or possible boosting.
+		bool needs_exp;	// Expedited handling needed.
 		struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 		struct rcu_node *rnp = rdp->mynode;
 
-		expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
-			   (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
-			   (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
-			    ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
-			   (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
-			    t->rcu_blocked_node);
+		needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled);
+
 		// Need to defer quiescent state until everything is enabled.
-		if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
+		if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) {
 			// Using softirq, safe to awaken, and either the
 			// wakeup is free or there is either an expedited
 			// GP in flight or a potential need to deboost.
@@ -695,7 +760,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			set_tsk_need_resched(current);
 			set_preempt_need_resched();
 			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
-			    expboost && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+			    needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
 			    cpu_online(rdp->cpu)) {
 				// Get scheduler to re-evaluate and call hooks.
 				// If !IRQ_WORK, FQS scan will eventually IPI.

From 463d46044f04013306a4893242f65788b8a16b2e Mon Sep 17 00:00:00 2001
From: Tze-nan Wu
Date: Thu, 17 Jul 2025 13:53:38 +0800
Subject: [PATCH 45/49] rcu: Fix delayed execution of hurry callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We observed a regression in our customer’s environment after enabling
CONFIG_LAZY_RCU. In the Android Update Engine scenario, where ioctl()
is used heavily, we found that callbacks queued via call_rcu_hurry
(such as percpu_ref_switch_to_atomic_rcu) can sometimes be delayed by
up to 5 seconds before execution. This occurs because the new grace
period does not start immediately after the previous one completes.

The root cause is that the wake_nocb_gp_defer() function now checks
"rdp->nocb_defer_wakeup" instead of "rdp_gp->nocb_defer_wakeup". On
CPUs that are not rcuog, "rdp->nocb_defer_wakeup" may always be
RCU_NOCB_WAKE_NOT. This can cause "rdp_gp->nocb_defer_wakeup" to be
downgraded and the "rdp_gp->nocb_timer" to be postponed by up to 10
seconds, delaying the execution of hurry RCU callbacks.

The trace log of one scenario we encountered is as follows:

// previous GP ends at this point
rcu_preempt   [000] d..1. 137.240210: rcu_grace_period: rcu_preempt 8369 end
rcu_preempt   [000] ..... 137.240212: rcu_grace_period: rcu_preempt 8372 reqwait
// call_rcu_hurry enqueues "percpu_ref_switch_to_atomic_rcu", the callback waited on by UpdateEngine
update_engine [002] d..1. 137.301593: __call_rcu_common: wyy: unlikely p_ref = 00000000********. lazy = 0
// FirstQ on cpu 2: rdp_gp->nocb_timer is set to fire after 1 jiffy (4ms),
// and rdp_gp->nocb_defer_wakeup is set to RCU_NOCB_WAKE
update_engine [002] d..2. 137.301595: rcu_nocb_wake: rcu_preempt 2 FirstQ on cpu2 with rdp_gp (cpu0).
// A FirstBQ event on cpu2 during that 1 jiffy makes the timer get postponed 10 seconds;
// also, rdp_gp->nocb_defer_wakeup is overwritten to RCU_NOCB_WAKE_LAZY
update_engine [002] d..1. 137.301601: rcu_nocb_wake: rcu_preempt 2 WakeEmptyIsDeferred
...
...
...
// before the 10 seconds timeout, cpu0 received another call_rcu_hurry,
// reset the timer to jiffies+1 and set the waketype = RCU_NOCB_WAKE.
kworker/u32:0 [000] d..2. 142.557564: rcu_nocb_wake: rcu_preempt 0 FirstQ
kworker/u32:0 [000] d..1. 142.557576: rcu_nocb_wake: rcu_preempt 0 WakeEmptyIsDeferred
kworker/u32:0 [000] d..1. 142.558296: rcu_nocb_wake: rcu_preempt 0 WakeNot
kworker/u32:0 [000] d..1. 142.558562: rcu_nocb_wake: rcu_preempt 0 WakeNot
// idle (do_nocb_deferred_wakeup) wakes rcuog because waketype == RCU_NOCB_WAKE
              [000] d..1. 142.558786: rcu_nocb_wake: rcu_preempt 0 DoWake
              [000] dN.1. 142.558839: rcu_nocb_wake: rcu_preempt 0 DeferredWake
rcuog/0       [000] ..... 142.558871: rcu_nocb_wake: rcu_preempt 0 EndSleep
rcuog/0       [000] ..... 142.558877: rcu_nocb_wake: rcu_preempt 0 Check
// finally rcuog requests a new GP at this point (5 seconds after the FirstQ event)
rcuog/0       [000] d..2. 142.558886: rcu_grace_period: rcu_preempt 8372 newreq
rcu_preempt   [001] d..1. 142.559458: rcu_grace_period: rcu_preempt 8373 start
...
rcu_preempt   [000] d..1. 142.564258: rcu_grace_period: rcu_preempt 8373 end
rcuop/2       [000] D..1. 142.566337: rcu_batch_start: rcu_preempt CBs=219 bl=10
// the hurry CB is invoked at this point
rcuop/2       [000] b.... 142.566352: blk_queue_usage_counter_release: wyy: wakeup. p_ref = 00000000********.

This patch changes the condition to check "rdp_gp->nocb_defer_wakeup"
in the lazy path. This prevents an already scheduled "rdp_gp->nocb_timer"
from being postponed and avoids overwriting "rdp_gp->nocb_defer_wakeup"
when it is not RCU_NOCB_WAKE_NOT.

Fixes: 3cb278e73be5 ("rcu: Make call_rcu() lazy to save power")
Co-developed-by: Cheng-jui Wang
Signed-off-by: Cheng-jui Wang
Co-developed-by: Lorry.Luo@mediatek.com
Signed-off-by: Lorry.Luo@mediatek.com
Tested-by: weiyangyang@vivo.com
Signed-off-by: weiyangyang@vivo.com
Signed-off-by: Tze-nan Wu
Reviewed-by: Frederic Weisbecker
Signed-off-by: Neeraj Upadhyay (AMD)
---
 kernel/rcu/tree_nocb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 08eb9b0e2fab..e6cd56603cad 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -276,7 +276,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
 	 * callback storms, no need to wake up too early.
 	 */
 	if (waketype == RCU_NOCB_WAKE_LAZY &&
-	    rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+	    rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
 		mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
 		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
 	} else if (waketype == RCU_NOCB_WAKE_BYPASS) {

From 954c0d74129948eed5c8f4a6898d3d5b344c8b18 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 17 Jul 2025 11:55:41 -0700
Subject: [PATCH 46/49] srcu: Add guards for SRCU-fast readers

This adds the usual scoped_guard(srcu_fast, &my_srcu) and
guard(srcu_fast)(&my_srcu).

Suggested-by: Steven Rostedt
Signed-off-by: Paul E. McKenney
Cc: Mathieu Desnoyers
Cc: Sebastian Andrzej Siewior
Reviewed-by: Steven Rostedt (Google)
Signed-off-by: Neeraj Upadhyay (AMD)
---
 include/linux/srcu.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index cf711a0f440b..f179700fecaf 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -481,4 +481,9 @@ DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
 		    srcu_read_unlock(_T->lock, _T->idx),
 		    int idx)
 
+DEFINE_LOCK_GUARD_1(srcu_fast, struct srcu_struct,
+		    _T->scp = srcu_read_lock_fast(_T->lock),
+		    srcu_read_unlock_fast(_T->lock, _T->scp),
+		    struct srcu_ctr __percpu *scp)
+
 #endif

From 30a7806adab5f6b971cf07439ed6a3fac3fd80cf Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Tue, 15 Jul 2025 16:01:53 -0400
Subject: [PATCH 47/49] rcu: Document GP init vs hotplug-scan ordering
 requirements

Add detailed comments explaining the critical ordering constraints
during RCU grace period initialization, based on discussions with
Frederic.

Reviewed-by: "Paul E. McKenney"
Co-developed-by: Frederic Weisbecker
Signed-off-by: Frederic Weisbecker
Signed-off-by: Joel Fernandes
Signed-off-by: Neeraj Upadhyay (AMD)
---
 .../RCU/Design/Requirements/Requirements.rst  | 41 +++++++++++++++++++
 kernel/rcu/tree.c                             |  8 ++++
 2 files changed, 49 insertions(+)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index 6125e7068d2c..771a1ce4c84d 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1970,6 +1970,47 @@ corresponding CPU's leaf node lock is held. This avoids race conditions
 between RCU's hotplug notifier hooks, the grace period initialization
 code, and the FQS loop, all of which refer to or modify this bookkeeping.
 
+Note that grace period initialization (rcu_gp_init()) must carefully sequence
+CPU hotplug scanning with grace period state changes. For example, the
+following race could occur in rcu_gp_init() if rcu_seq_start() were to happen
+after the CPU hotplug scanning.
+
+.. code-block:: none
+
+   CPU0 (rcu_gp_init)                   CPU1                        CPU2
+   ---------------------                ----                        ----
+   // Hotplug scan first (WRONG ORDER)
+   rcu_for_each_leaf_node(rnp) {
+       rnp->qsmaskinit = rnp->qsmaskinitnext;
+   }
+                                        rcutree_report_cpu_starting()
+                                            rnp->qsmaskinitnext |= mask;
+                                        rcu_read_lock()
+                                        r0 = *X;
+                                                                    r1 = *X;
+                                                                    X = NULL;
+                                                                    cookie = get_state_synchronize_rcu();
+                                                                    // cookie = 8 (future GP)
+   rcu_seq_start(&rcu_state.gp_seq);
+   // gp_seq = 5
+
+   // CPU1 now invisible to this GP!
+   rcu_for_each_node_breadth_first() {
+       rnp->qsmask = rnp->qsmaskinit;
+       // CPU1 not included!
+   }
+
+   // GP completes without CPU1
+   rcu_seq_end(&rcu_state.gp_seq);
+   // gp_seq = 8
+                                                                    poll_state_synchronize_rcu(cookie);
+                                                                    // Returns true!
+                                                                    kfree(r1);
+                                        r2 = *r0; // USE-AFTER-FREE!
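[ Editor's note: the cookie in the diagram above comes from RCU's polled
  grace-period API. A minimal usage sketch, assuming the in-kernel
  get_state_synchronize_rcu()/poll_state_synchronize_rcu()/
  cond_synchronize_rcu() interfaces and hypothetical gp, gp_lock, and
  struct foo names:

	struct foo *p;
	unsigned long cookie;

	p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
	rcu_assign_pointer(gp, NULL);		/* Unpublish the object. */
	cookie = get_state_synchronize_rcu();	/* Snapshot a future GP. */
	/* ... time passes ... */
	if (!poll_state_synchronize_rcu(cookie))
		cond_synchronize_rcu(cookie);	/* Wait out that GP if needed. */
	kfree(p);				/* No reader can still hold p. */

  The bug in the diagram is precisely that, with the wrong ordering in
  rcu_gp_init(), poll_state_synchronize_rcu() could return true while
  CPU1's pre-existing reader was still running. ]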
+
+By incrementing gp_seq first, CPU1's RCU read-side critical section
+is guaranteed to not be missed by CPU2.
+
 Scheduler and RCU
 ~~~~~~~~~~~~~~~~~
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8c22db759978..384af6b5cf9c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1838,6 +1838,14 @@ static noinline_for_stack bool rcu_gp_init(void)
 	start_new_poll = rcu_sr_normal_gp_init();
 	/* Record GP times before starting GP, hence rcu_seq_start(). */
 	old_gp_seq = rcu_state.gp_seq;
+	/*
+	 * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug
+	 * scan below. Otherwise we risk a race where a newly onlining CPU could
+	 * be missed by the current grace period, potentially leading to
+	 * use-after-free errors. For a detailed explanation of this race, see
+	 * Documentation/RCU/Design/Requirements/Requirements.rst in the
+	 * "Hotplug CPU" section.
+	 */
 	rcu_seq_start(&rcu_state.gp_seq);
 	/* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&

From 186779c036468038b0d077ec5333a51512f867e5 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Tue, 15 Jul 2025 16:01:54 -0400
Subject: [PATCH 48/49] rcu: Document separation of rcu_state and rnp's gp_seq

The details of this are subtle and were discussed recently. Add a
quick-quiz about this and refer to it from the code, for more clarity.

Reviewed-by: "Paul E. McKenney"
Signed-off-by: Joel Fernandes
Signed-off-by: Neeraj Upadhyay (AMD)
---
 .../Data-Structures/Data-Structures.rst       | 33 +++++++++++++++++++
 kernel/rcu/tree.c                             |  4 +++
 2 files changed, 37 insertions(+)

diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
index 04e16775c752..1b0aad184dd7 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
@@ -286,6 +286,39 @@ in order to detect the beginnings and ends of grace periods in a
 distributed fashion. The values flow from ``rcu_state`` to ``rcu_node``
 (down the tree from the root to the leaves) to ``rcu_data``.
 
++-----------------------------------------------------------------------+
+| **Quick Quiz**:                                                       |
++-----------------------------------------------------------------------+
+| Given that the root rcu_node structure has a gp_seq field,            |
+| why does RCU maintain a separate gp_seq in the rcu_state structure?   |
+| Why not just use the root rcu_node's gp_seq as the official record    |
+| and update it directly when starting a new grace period?              |
++-----------------------------------------------------------------------+
+| **Answer**:                                                           |
++-----------------------------------------------------------------------+
+| On single-node RCU trees (where the root node is also a leaf),        |
+| updating the root node's gp_seq immediately would create unnecessary  |
+| lock contention. Here's why:                                          |
+|                                                                       |
+| If we did rcu_seq_start() directly on the root node's gp_seq:         |
+|                                                                       |
+| 1. All CPUs would immediately see their node's gp_seq from their      |
+|    rdp's gp_seq, in rcu_pending(). They would all then invoke the     |
+|    RCU-core.                                                          |
+| 2. Which calls note_gp_changes() and tries to acquire the node lock.  |
+| 3. But rnp->qsmask isn't initialized yet (happens later in            |
+|    rcu_gp_init())                                                     |
+| 4. So each CPU would acquire the lock, find it can't determine if it  |
+|    needs to report quiescent state (no qsmask), update rdp->gp_seq,   |
+|    and release the lock.                                              |
+| 5. Result: Lots of lock acquisitions with no grace period progress    |
+|                                                                       |
+| By having a separate rcu_state.gp_seq, we can increment the official  |
+| grace period counter without immediately affecting what CPUs see in   |
+| their nodes. The hierarchical propagation in rcu_gp_init() then       |
+| updates the root node's gp_seq and qsmask together under the same     |
+| lock acquisition, avoiding this useless contention.                   |
++-----------------------------------------------------------------------+
+
 Miscellaneous
 '''''''''''''
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 384af6b5cf9c..b206db119546 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1845,6 +1845,10 @@ static noinline_for_stack bool rcu_gp_init(void)
 	 * use-after-free errors. For a detailed explanation of this race, see
 	 * Documentation/RCU/Design/Requirements/Requirements.rst in the
 	 * "Hotplug CPU" section.
+	 *
+	 * Also note that the root rnp's gp_seq is kept separate from, and lags,
+	 * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on
+	 * Single-node systems for more details (in Data-Structures.rst).
 	 */
 	rcu_seq_start(&rcu_state.gp_seq);
 	/* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */

From 5d71c2b53f1790c2ca09d03848839c610653d278 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Tue, 15 Jul 2025 16:01:55 -0400
Subject: [PATCH 49/49] rcu: Document concurrent quiescent state reporting for
 offline CPUs

The synchronization of CPU offlining with GP initialization is
confusing, to put it mildly (rightfully so, as the issue it deals with
is complex). Recent discussions brought up a question -- what prevents
rcu_implicit_dyntick_qs() from warning about QS reports for offline
CPUs (missing QS reports for offline CPUs causing indefinite hangs).

QS reporting for now-offline CPUs should only happen from:
- gp_init()
- rcutree_cpu_report_dead()

Add some documentation on this and refer to it from comments in the
code explaining how QS reporting is not missed when these functions
are concurrently running.

I referred heavily to this post [1] about the need for the ofl_lock.
[1] https://lore.kernel.org/all/20180924164443.GF4222@linux.ibm.com/

[ Applied paulmck feedback on moving documentation to Requirements.rst ]

Link: https://lore.kernel.org/all/01b4d228-9416-43f8-a62e-124b92e8741a@paulmck-laptop/
Co-developed-by: "Paul E. McKenney"
Signed-off-by: "Paul E. McKenney"
Reviewed-by: Frederic Weisbecker
Signed-off-by: Joel Fernandes
Signed-off-by: Neeraj Upadhyay (AMD)
---
 .../RCU/Design/Requirements/Requirements.rst  | 87 +++++++++++++++++++
 kernel/rcu/tree.c                             | 19 +++-
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index 771a1ce4c84d..b0395540296b 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -2011,6 +2011,93 @@ after the CPU hotplug scanning.
 By incrementing gp_seq first, CPU1's RCU read-side critical section
 is guaranteed to not be missed by CPU2.
 
+**Concurrent Quiescent State Reporting for Offline CPUs**
+
+RCU must ensure that CPUs going offline report quiescent states to avoid
+blocking grace periods. This requires careful synchronization to handle
+race conditions.
+
+**Race condition causing Offline CPU to hang GP**
+
+A race between CPU offlining and new GP initialization (gp_init) may occur
+because `rcu_report_qs_rnp()` in `rcutree_report_cpu_dead()` must temporarily
+release the `rcu_node` lock to wake the RCU grace-period kthread:
+
+.. code-block:: none
+
+   CPU1 (going offline)                 CPU0 (GP kthread)
+   --------------------                 -----------------
+   rcutree_report_cpu_dead()
+     rcu_report_qs_rnp()
+       // Must release rnp->lock to wake GP kthread
+       raw_spin_unlock_irqrestore_rcu_node()
+                                        // Wakes up and starts new GP
+                                        rcu_gp_init()
+                                        // First loop:
+                                        copies qsmaskinitnext->qsmaskinit
+                                        // CPU1 still in qsmaskinitnext!
+
+                                        // Second loop:
+                                        rnp->qsmask = rnp->qsmaskinit
+                                        mask = rnp->qsmask & ~rnp->qsmaskinitnext
+                                        // mask is 0! CPU1 still in both masks
+       // Reacquire lock (but too late)
+   rnp->qsmaskinitnext &= ~mask         // Finally clears bit
+
+Without `ofl_lock`, the new grace period includes the offline CPU and waits
+forever for its quiescent state, causing a GP hang.
+
+**A solution with ofl_lock**
+
+The `ofl_lock` (offline lock) prevents `rcu_gp_init()` from running during
+the vulnerable window when `rcu_report_qs_rnp()` has released `rnp->lock`:
+
+.. code-block:: none
+
+   CPU0 (rcu_gp_init)                   CPU1 (rcutree_report_cpu_dead)
+   ------------------                   ------------------------------
+   rcu_for_each_leaf_node(rnp) {
+       arch_spin_lock(&ofl_lock) -----> arch_spin_lock(&ofl_lock) [BLOCKED]
+
+       // Safe: CPU1 can't interfere
+       rnp->qsmaskinit = rnp->qsmaskinitnext
+
+       arch_spin_unlock(&ofl_lock) ---> // Now CPU1 can proceed
+   }                                    // But snapshot already taken
+
+**Another race causing GP hangs in rcu_gp_init(): Reporting QS for Now-offline CPUs**
+
+After the first loop takes an atomic snapshot of online CPUs, as shown above,
+the second loop in `rcu_gp_init()` detects CPUs that went offline between
+releasing `ofl_lock` and acquiring the per-node `rnp->lock`. This detection is
+crucial because:
+
+1. The CPU might have gone offline after the snapshot but before the second loop
+2. The offline CPU cannot report its own QS if it's already dead
+3. Without this detection, the grace period would wait forever for CPUs that
+   are now offline.
+
+The second loop performs this detection safely:
+
+.. code-block:: none
+
+   rcu_for_each_node_breadth_first(rnp) {
+       raw_spin_lock_irqsave_rcu_node(rnp, flags);
+       rnp->qsmask = rnp->qsmaskinit; // Apply the snapshot
+
+       // Detect CPUs offline after snapshot
+       mask = rnp->qsmask & ~rnp->qsmaskinitnext;
+
+       if (mask && rcu_is_leaf_node(rnp))
+           rcu_report_qs_rnp(mask, ...) // Report QS for offline CPUs
+   }
+
+This approach ensures atomicity: quiescent state reporting for offline CPUs
+happens either in `rcu_gp_init()` (second loop) or in
+`rcutree_report_cpu_dead()`, never both and never neither. The `rnp->lock`
+held throughout the sequence prevents races - `rcutree_report_cpu_dead()`
+also acquires this lock when clearing `qsmaskinitnext`, ensuring mutual
+exclusion.
+
 Scheduler and RCU
 ~~~~~~~~~~~~~~~~~
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b206db119546..dd282a984fec 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1886,6 +1886,10 @@ static noinline_for_stack bool rcu_gp_init(void)
 	/* Exclude CPU hotplug operations. */
 	rcu_for_each_leaf_node(rnp) {
 		local_irq_disable();
+		/*
+		 * Serialize with CPU offline. See Requirements.rst > Hotplug CPU >
+		 * Concurrent Quiescent State Reporting for Offline CPUs.
+		 */
 		arch_spin_lock(&rcu_state.ofl_lock);
 		raw_spin_lock_rcu_node(rnp);
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1960,7 +1964,12 @@ static noinline_for_stack bool rcu_gp_init(void)
 		trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
 					    rnp->level, rnp->grplo,
 					    rnp->grphi, rnp->qsmask);
-		/* Quiescent states for tasks on any now-offline CPUs. */
+		/*
+		 * Quiescent states for tasks on any now-offline CPUs. Since we
+		 * released the ofl and rnp lock before this loop, CPUs might
+		 * have gone offline and we have to report QS on their behalf.
+		 * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting.
+		 */
 		mask = rnp->qsmask & ~rnp->qsmaskinitnext;
 		rnp->rcu_gp_init_mask = mask;
 		if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
@@ -4386,6 +4395,13 @@ void rcutree_report_cpu_dead(void)
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
+
+	/*
+	 * Hold the ofl_lock and rnp lock to avoid races between CPU going
+	 * offline and doing a QS report (as below), versus rcu_gp_init().
+	 * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section
+	 * for more details.
+	 */
 	arch_spin_lock(&rcu_state.ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4396,6 +4412,7 @@ void rcutree_report_cpu_dead(void)
 		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	}
+	/* Clear from ->qsmaskinitnext to mark offline. */
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	arch_spin_unlock(&rcu_state.ofl_lock);
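[ Editor's note: patch 46 above defines the srcu_fast guards but shows no
  caller. A usage sketch, assuming a hypothetical srcu_struct my_srcu;
  both forms come straight from that patch's commit message, and the
  guard automatically releases the struct srcu_ctr __percpu cookie at
  scope exit:

	DEFINE_SRCU(my_srcu);

	void reader_block_scope(void)
	{
		scoped_guard(srcu_fast, &my_srcu) {
			/* SRCU-fast read-side critical section. */
		}
	}

	void reader_function_scope(void)
	{
		guard(srcu_fast)(&my_srcu);
		/* Critical section extends to the end of the function. */
	}

  Compared with open-coded srcu_read_lock_fast()/srcu_read_unlock_fast(),
  the guards make it impossible to leak the read-side critical section on
  an early return. ]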