From 32471b2f481dea8624f27669d36ffd131d24b732 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:22 +0200 Subject: [PATCH 01/15] net: page_pool: Don't recycle into cache on PREEMPT_RT With preemptible softirq and no per-CPU locking in local_bh_disable() on PREEMPT_RT the consumer can be preempted while a skb is returned. Avoid the race by disabling the recycle into the cache on PREEMPT_RT. Cc: Jesper Dangaard Brouer Cc: Ilias Apalodimas Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-2-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/core/page_pool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 2b7684865941..974f3eef2efa 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -839,6 +839,10 @@ static bool page_pool_napi_local(const struct page_pool *pool) const struct napi_struct *napi; u32 cpuid; + /* On PREEMPT_RT the softirq can be preempted by the consumer */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + if (unlikely(!in_softirq())) return false; From c99dac52ffad5df88c9c52ab2008b182743bdcc1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:23 +0200 Subject: [PATCH 02/15] net: dst_cache: Use nested-BH locking for dst_cache::cache dst_cache::cache is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-3-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/core/dst_cache.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 70c634b9e7b0..93a04d18e505 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -17,6 +17,7 @@ struct dst_cache_pcpu { unsigned long refresh_ts; struct dst_entry *dst; + local_lock_t bh_lock; u32 cookie; union { struct in_addr in_saddr; @@ -65,10 +66,15 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) { + struct dst_entry *dst; + if (!dst_cache->cache) return NULL; - return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_lock_nested_bh(&dst_cache->cache->bh_lock); + dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; } EXPORT_SYMBOL_GPL(dst_cache_get); @@ -80,12 +86,16 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in_saddr.s_addr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -98,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); 
dst_cache_per_cpu_dst_set(idst, dst, 0); idst->in_saddr.s_addr = saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip4); @@ -113,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -129,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in6_saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get_ip6); @@ -142,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6); int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) { + unsigned int i; + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, gfp | __GFP_ZERO); if (!dst_cache->cache) return -ENOMEM; + for_each_possible_cpu(i) + local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); dst_cache_reset(dst_cache); return 0; From 1c0829788a6e6e165846b9bedd0b908ef16260b6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:24 +0200 Subject: [PATCH 03/15] ipv4/route: Use this_cpu_inc() for stats on PREEMPT_RT The statistics are incremented with raw_cpu_inc() assuming it always happens with bottom half disabled. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this is no longer true. Use this_cpu_inc() on PREEMPT_RT for the increment to not worry about preemption. Cc: David Ahern Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-4-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 507b2e5dec50..fccb05fb3a79 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -189,7 +189,11 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); +#ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) +#else +#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) +#endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) From bc57eda646cea6a9077ba1b781bf64bc0ab836a7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:25 +0200 Subject: [PATCH 04/15] ipv6: sr: Use nested-BH locking for hmac_storage hmac_storage is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. 
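The local_lock_nested_bh() conversion described above is the pattern most of this series applies: wrap the per-CPU data in a struct together with a local_lock_t and take that lock while bottom halves are already disabled. A minimal stand-alone sketch of the pattern, using hypothetical names (example_pcpu, example_update) rather than any structure from the patches:

#include <linux/local_lock.h>
#include <linux/percpu.h>
#include <linux/types.h>

struct example_pcpu {
	local_lock_t bh_lock;
	u32 value;
};

static DEFINE_PER_CPU(struct example_pcpu, example_pcpu) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Caller runs with BH disabled (softirq or a local_bh_disable() section). */
static void example_update(u32 v)
{
	struct example_pcpu *p;

	local_lock_nested_bh(&example_pcpu.bh_lock);
	p = this_cpu_ptr(&example_pcpu);
	p->value = v;
	local_unlock_nested_bh(&example_pcpu.bh_lock);
}

On !PREEMPT_RT the lock compiles down to a lockdep annotation, so behaviour is unchanged; on PREEMPT_RT it is a real per-CPU lock that serialises the now-preemptible BH sections. Dynamically allocated per-CPU data (as in the dst_cache patch above) initialises the lock per possible CPU with local_lock_init() instead of the static INIT_LOCAL_LOCK() initialiser.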
Cc: David Ahern Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-5-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/ipv6/seg6_hmac.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index bbf5b84a70fc..f78ecb6ad838 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -40,7 +40,14 @@ #include #include -static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring); +struct hmac_storage { + local_lock_t bh_lock; + char hmac_ring[SEG6_HMAC_RING_SIZE]; +}; + +static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -187,7 +194,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, */ local_bh_disable(); - ring = this_cpu_ptr(hmac_ring); + local_lock_nested_bh(&hmac_storage.bh_lock); + ring = this_cpu_ptr(hmac_storage.hmac_ring); off = ring; /* source address */ @@ -212,6 +220,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, dgsize = __do_hmac(hinfo, ring, plen, tmp_out, SEG6_HMAC_MAX_DIGESTSIZE); + local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); if (dgsize < 0) From b9eef3391de028fdd88fd7a2f81a4834fc98c9ac Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:26 +0200 Subject: [PATCH 05/15] xdp: Use nested-BH locking for system_page_pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit system_page_pool is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Make a struct with a page_pool member (original system_page_pool) and a local_lock_t and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Cc: Andrew Lunn Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jesper Dangaard Brouer Cc: John Fastabend Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-6-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 7 ++++++- net/core/dev.c | 15 ++++++++++----- net/core/xdp.c | 15 ++++++++++----- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9e3a2d8452d6..73a97cf1bbce 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3503,7 +3503,12 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); -DECLARE_PER_CPU(struct page_pool *, system_page_pool); + +struct page_pool_bh { + struct page_pool *pool; + local_lock_t bh_lock; +}; +DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) diff --git a/net/core/dev.c b/net/core/dev.c index 33d5e95209cb..eea26162a585 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -462,7 +462,9 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). 
*/ -DEFINE_PER_CPU(struct page_pool *, system_page_pool); +DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #ifdef CONFIG_LOCKDEP /* @@ -5322,7 +5324,10 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) struct sk_buff *skb = *pskb; int err, hroom, troom; - if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + local_lock_nested_bh(&system_page_pool.bh_lock); + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); + local_unlock_nested_bh(&system_page_pool.bh_lock); + if (!err) return 0; /* In case we have to go down the path and also linearize, @@ -12712,7 +12717,7 @@ static int net_page_pool_create(int cpuid) return err; } - per_cpu(system_page_pool, cpuid) = pp_ptr; + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; #endif return 0; } @@ -12842,13 +12847,13 @@ static int __init net_dev_init(void) for_each_possible_cpu(i) { struct page_pool *pp_ptr; - pp_ptr = per_cpu(system_page_pool, i); + pp_ptr = per_cpu(system_page_pool.pool, i); if (!pp_ptr) continue; xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); - per_cpu(system_page_pool, i) = NULL; + per_cpu(system_page_pool.pool, i) = NULL; } } diff --git a/net/core/xdp.c b/net/core/xdp.c index 4e91c7790671..e6f22ba61c1e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -739,25 +739,27 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, */ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) { - struct page_pool *pp = this_cpu_read(system_page_pool); const struct xdp_rxq_info *rxq = xdp->rxq; u32 len = xdp->data_end - xdp->data_meta; u32 truesize = xdp->frame_sz; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct page_pool *pp; int metalen; void *data; if (!IS_ENABLED(CONFIG_PAGE_POOL)) return NULL; + local_lock_nested_bh(&system_page_pool.bh_lock); + pp = this_cpu_read(system_page_pool.pool); data = page_pool_dev_alloc_va(pp, &truesize); if (unlikely(!data)) - return NULL; + goto out; skb = napi_build_skb(data, truesize); if (unlikely(!skb)) { page_pool_free_va(pp, data, true); - return NULL; + goto out; } skb_mark_for_recycle(skb); @@ -776,13 +778,16 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) if (unlikely(xdp_buff_has_frags(xdp)) && unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { napi_consume_skb(skb, true); - return NULL; + skb = NULL; + goto out; } xsk_buff_free(xdp); skb->protocol = eth_type_trans(skb, rxq->dev); +out: + local_unlock_nested_bh(&system_page_pool.bh_lock); return skb; } EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); From 9c607d4b6589d4d380a85784514bcf4cceee1e11 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:27 +0200 Subject: [PATCH 06/15] xfrm: Use nested-BH locking for nat_keepalive_sk_ipv[46] nat_keepalive_sk_ipv[46] is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Use sock_bh_locked which has a sock pointer and a local_lock_t. Use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. 
Cc: Steffen Klassert Cc: Herbert Xu Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-7-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/xfrm/xfrm_nat_keepalive.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/net/xfrm/xfrm_nat_keepalive.c b/net/xfrm/xfrm_nat_keepalive.c index 82f0a301683f..ebf95d48e86c 100644 --- a/net/xfrm/xfrm_nat_keepalive.c +++ b/net/xfrm/xfrm_nat_keepalive.c @@ -9,9 +9,13 @@ #include #include -static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv4); +static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv4) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #if IS_ENABLED(CONFIG_IPV6) -static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv6); +static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv6) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #endif struct nat_keepalive { @@ -56,10 +60,12 @@ static int nat_keepalive_send_ipv4(struct sk_buff *skb, skb_dst_set(skb, &rt->dst); - sk = *this_cpu_ptr(&nat_keepalive_sk_ipv4); + local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); + sk = this_cpu_read(nat_keepalive_sk_ipv4.sock); sock_net_set(sk, net); err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos); sock_net_set(sk, &init_net); + local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); return err; } @@ -89,15 +95,19 @@ static int nat_keepalive_send_ipv6(struct sk_buff *skb, fl6.fl6_sport = ka->encap_sport; fl6.fl6_dport = ka->encap_dport; - sk = *this_cpu_ptr(&nat_keepalive_sk_ipv6); + local_lock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); + sk = this_cpu_read(nat_keepalive_sk_ipv6.sock); sock_net_set(sk, net); dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL); - if (IS_ERR(dst)) + if (IS_ERR(dst)) { + local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); return PTR_ERR(dst); + } skb_dst_set(skb, dst); err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0); sock_net_set(sk, &init_net); + local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); return err; } #endif @@ -202,7 +212,7 @@ static void nat_keepalive_work(struct work_struct *work) (ctx.next_run - ctx.now) * HZ); } -static int nat_keepalive_sk_init(struct sock * __percpu *socks, +static int nat_keepalive_sk_init(struct sock_bh_locked __percpu *socks, unsigned short family) { struct sock *sk; @@ -214,22 +224,22 @@ static int nat_keepalive_sk_init(struct sock * __percpu *socks, if (err < 0) goto err; - *per_cpu_ptr(socks, i) = sk; + per_cpu_ptr(socks, i)->sock = sk; } return 0; err: for_each_possible_cpu(i) - inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); + inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); return err; } -static void nat_keepalive_sk_fini(struct sock * __percpu *socks) +static void nat_keepalive_sk_fini(struct sock_bh_locked __percpu *socks) { int i; for_each_possible_cpu(i) - inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); + inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); } void xfrm_nat_keepalive_state_updated(struct xfrm_state *x) From 035fcdc4d240c873c89b76b752dd9921bc88c1ba Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:28 +0200 Subject: [PATCH 07/15] openvswitch: Merge three per-CPU structures into one exec_actions_level is a per-CPU integer allocated at compile time. action_fifos and flow_keys are per-CPU pointer and have their data allocated at module init time. There is no gain in splitting it, once the module is allocated, the structures are allocated. 
Merge the three per-CPU variables into ovs_pcpu_storage, adapt callers. Cc: Aaron Conole Cc: Eelco Chaudron Cc: Ilya Maximets Cc: dev@openvswitch.org Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20250512092736.229935-8-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/openvswitch/actions.c | 49 +++++++++++++------------------------- net/openvswitch/datapath.c | 9 +------ net/openvswitch/datapath.h | 3 --- 3 files changed, 17 insertions(+), 44 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2f22ca59586f..7e4a8d41b9ed 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -78,17 +78,22 @@ struct action_flow_keys { struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; }; -static struct action_fifo __percpu *action_fifos; -static struct action_flow_keys __percpu *flow_keys; -static DEFINE_PER_CPU(int, exec_actions_level); +struct ovs_pcpu_storage { + struct action_fifo action_fifos; + struct action_flow_keys flow_keys; + int exec_level; +}; + +static DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage); /* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' * space. Return NULL if out of key spaces. */ static struct sw_flow_key *clone_key(const struct sw_flow_key *key_) { - struct action_flow_keys *keys = this_cpu_ptr(flow_keys); - int level = this_cpu_read(exec_actions_level); + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage); + struct action_flow_keys *keys = &ovs_pcpu->flow_keys; + int level = ovs_pcpu->exec_level; struct sw_flow_key *key = NULL; if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { @@ -132,10 +137,9 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, const struct nlattr *actions, const int actions_len) { - struct action_fifo *fifo; + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); struct deferred_action *da; - fifo = this_cpu_ptr(action_fifos); da = action_fifo_put(fifo); if (da) { da->skb = skb; @@ -1608,13 +1612,13 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, if (actions) { /* Sample action */ if (clone_flow_key) - __this_cpu_inc(exec_actions_level); + __this_cpu_inc(ovs_pcpu_storage.exec_level); err = do_execute_actions(dp, skb, clone, actions, len); if (clone_flow_key) - __this_cpu_dec(exec_actions_level); + __this_cpu_dec(ovs_pcpu_storage.exec_level); } else { /* Recirc action */ clone->recirc_id = recirc_id; ovs_dp_process_packet(skb, clone); @@ -1650,7 +1654,7 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, static void process_deferred_actions(struct datapath *dp) { - struct action_fifo *fifo = this_cpu_ptr(action_fifos); + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); /* Do not touch the FIFO in case there is no deferred actions. 
*/ if (action_fifo_is_empty(fifo)) @@ -1681,7 +1685,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, { int err, level; - level = __this_cpu_inc_return(exec_actions_level); + level = __this_cpu_inc_return(ovs_pcpu_storage.exec_level); if (unlikely(level > OVS_RECURSION_LIMIT)) { net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", ovs_dp_name(dp)); @@ -1698,27 +1702,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, process_deferred_actions(dp); out: - __this_cpu_dec(exec_actions_level); + __this_cpu_dec(ovs_pcpu_storage.exec_level); return err; } - -int action_fifos_init(void) -{ - action_fifos = alloc_percpu(struct action_fifo); - if (!action_fifos) - return -ENOMEM; - - flow_keys = alloc_percpu(struct action_flow_keys); - if (!flow_keys) { - free_percpu(action_fifos); - return -ENOMEM; - } - - return 0; -} - -void action_fifos_exit(void) -{ - free_percpu(action_fifos); - free_percpu(flow_keys); -} diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 5d548eda742d..aaa6277bb49c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2729,13 +2729,9 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); - err = action_fifos_init(); - if (err) - goto error; - err = ovs_internal_dev_rtnl_link_register(); if (err) - goto error_action_fifos_exit; + goto error; err = ovs_flow_init(); if (err) @@ -2778,8 +2774,6 @@ static int __init dp_init(void) ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); -error_action_fifos_exit: - action_fifos_exit(); error: return err; } @@ -2795,7 +2789,6 @@ static void dp_cleanup(void) ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); - action_fifos_exit(); } module_init(dp_init); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 384ca77f4e79..a12640792605 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -281,9 +281,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, void ovs_dp_notify_wq(struct work_struct *work); -int action_fifos_init(void); -void action_fifos_exit(void); - /* 'KEY' must not have any bits set outside of the 'MASK' */ #define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) #define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) From 672318331b44753ab7bd8545558939c38b4c1132 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:29 +0200 Subject: [PATCH 08/15] openvswitch: Use nested-BH locking for ovs_pcpu_storage ovs_pcpu_storage is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. The data structure can be referenced recursive and there is a recursion counter to avoid too many recursions. Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. Add an owner of the struct which is the current task and acquire the lock only if the structure is not owned by the current task. 
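The recursion-aware locking described above mirrors the scheme the hunk in ovs_dp_process_packet() adopts below. A condensed sketch with hypothetical names (example_recursive_pcpu, example_process), not a literal copy of the hunk:

#include <linux/local_lock.h>
#include <linux/percpu.h>
#include <linux/sched.h>

struct example_recursive_pcpu {
	local_lock_t bh_lock;
	struct task_struct *owner;
};

static DEFINE_PER_CPU(struct example_recursive_pcpu, example_storage) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Runs with BH already disabled, so the CPU cannot change under us. */
static void example_process(void)
{
	struct example_recursive_pcpu *pcpu = this_cpu_ptr(&example_storage);
	bool locked = false;

	/* Only the outermost invocation takes the lock on PREEMPT_RT; a
	 * recursive call from the same task finds owner == current and
	 * must not try to acquire the non-recursive lock again.  Reading
	 * owner without the lock is fine: it can only equal current if
	 * this very task set it while holding the lock.
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && pcpu->owner != current) {
		local_lock_nested_bh(&example_storage.bh_lock);
		pcpu->owner = current;
		locked = true;
	}

	/* ... work that may recurse back into example_process() ... */

	if (locked) {
		pcpu->owner = NULL;
		local_unlock_nested_bh(&example_storage.bh_lock);
	}
}

On !PREEMPT_RT the owner field is never assigned in this path and serialisation still comes from BH being disabled, so behaviour is unchanged there.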
Cc: Aaron Conole Cc: Eelco Chaudron Cc: Ilya Maximets Cc: dev@openvswitch.org Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20250512092736.229935-9-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/openvswitch/actions.c | 31 ++----------------------------- net/openvswitch/datapath.c | 24 ++++++++++++++++++++++++ net/openvswitch/datapath.h | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 29 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 7e4a8d41b9ed..435725c27a55 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -39,15 +39,6 @@ #include "flow_netlink.h" #include "openvswitch_trace.h" -struct deferred_action { - struct sk_buff *skb; - const struct nlattr *actions; - int actions_len; - - /* Store pkt_key clone when creating deferred action. */ - struct sw_flow_key pkt_key; -}; - #define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) struct ovs_frag_data { unsigned long dst; @@ -64,28 +55,10 @@ struct ovs_frag_data { static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); -#define DEFERRED_ACTION_FIFO_SIZE 10 -#define OVS_RECURSION_LIMIT 5 -#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) -struct action_fifo { - int head; - int tail; - /* Deferred action fifo queue storage. */ - struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; -struct action_flow_keys { - struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; -}; - -struct ovs_pcpu_storage { - struct action_fifo action_fifos; - struct action_flow_keys flow_keys; - int exec_level; -}; - -static DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage); - /* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' * space. Return NULL if out of key spaces. */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index aaa6277bb49c..6a304ae2d959 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -244,11 +244,13 @@ void ovs_dp_detach_port(struct vport *p) /* Must be called with rcu_read_lock. */ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage); const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; + bool ovs_pcpu_locked = false; u64 *stats_counter; u32 n_mask_hit; u32 n_cache_hit; @@ -290,10 +292,26 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) ovs_flow_stats_update(flow, key->tp.flags, skb); sf_acts = rcu_dereference(flow->sf_acts); + /* This path can be invoked recursively: Use the current task to + * identify recursive invocation - the lock must be acquired only once. + * Even with disabled bottom halves this can be preempted on PREEMPT_RT. + * Limit the locking to RT to avoid assigning `owner' if it can be + * avoided. 
+ */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) { + local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); + ovs_pcpu->owner = current; + ovs_pcpu_locked = true; + } + error = ovs_execute_actions(dp, skb, sf_acts, key); if (unlikely(error)) net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", ovs_dp_name(dp), error); + if (ovs_pcpu_locked) { + ovs_pcpu->owner = NULL; + local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); + } stats_counter = &stats->n_hit; @@ -671,7 +689,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); + local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage.owner, current); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage.owner, NULL); + local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); local_bh_enable(); rcu_read_unlock(); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index a12640792605..4a665c3cfa90 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -173,6 +173,39 @@ struct ovs_net { bool xt_label; }; +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + int actions_len; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +#define OVS_RECURSION_LIMIT 5 +#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) + +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +struct action_flow_keys { + struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; +}; + +struct ovs_pcpu_storage { + struct action_fifo action_fifos; + struct action_flow_keys flow_keys; + int exec_level; + struct task_struct *owner; + local_lock_t bh_lock; +}; +DECLARE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage); + /** * enum ovs_pkt_hash_types - hash info to include with a packet * to send to userspace. From 3af4cdd67f32529c177b885d4ca491710e961928 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:30 +0200 Subject: [PATCH 09/15] openvswitch: Move ovs_frag_data_storage into the struct ovs_pcpu_storage ovs_frag_data_storage is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Move ovs_frag_data_storage into the struct ovs_pcpu_storage which already provides locking for the structure. 
Cc: Aaron Conole Cc: Eelco Chaudron Cc: Ilya Maximets Cc: dev@openvswitch.org Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20250512092736.229935-10-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/openvswitch/actions.c | 20 ++------------------ net/openvswitch/datapath.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 435725c27a55..e7269a3eec79 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -39,22 +39,6 @@ #include "flow_netlink.h" #include "openvswitch_trace.h" -#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) -struct ovs_frag_data { - unsigned long dst; - struct vport *vport; - struct ovs_skb_cb cb; - __be16 inner_protocol; - u16 network_offset; /* valid only for MPLS */ - u16 vlan_tci; - __be16 vlan_proto; - unsigned int l2_len; - u8 mac_proto; - u8 l2_data[MAX_L2_LEN]; -}; - -static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); - DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; @@ -771,7 +755,7 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); + struct ovs_frag_data *data = this_cpu_ptr(&ovs_pcpu_storage.frag_data); struct vport *vport = data->vport; if (skb_cow_head(skb, data->l2_len) < 0) { @@ -823,7 +807,7 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb, unsigned int hlen = skb_network_offset(skb); struct ovs_frag_data *data; - data = this_cpu_ptr(&ovs_frag_data_storage); + data = this_cpu_ptr(&ovs_pcpu_storage.frag_data); data->dst = skb->_skb_refdst; data->vport = vport; data->cb = *OVS_CB(skb); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 4a665c3cfa90..1b5348b0f559 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "conntrack.h" #include "flow.h" @@ -173,6 +174,20 @@ struct ovs_net { bool xt_label; }; +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + u16 network_offset; /* valid only for MPLS */ + u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 mac_proto; + u8 l2_data[MAX_L2_LEN]; +}; + struct deferred_action { struct sk_buff *skb; const struct nlattr *actions; @@ -200,6 +215,7 @@ struct action_flow_keys { struct ovs_pcpu_storage { struct action_fifo action_fifos; struct action_flow_keys flow_keys; + struct ovs_frag_data frag_data; int exec_level; struct task_struct *owner; local_lock_t bh_lock; From 7fe70c06a182a140be9996b02256d907e114479a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:31 +0200 Subject: [PATCH 10/15] net/sched: act_mirred: Move the recursion counter struct netdev_xmit mirred_nest_level is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Move mirred_nest_level to struct netdev_xmit as u8, provide wrappers. 
Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Juri Lelli Link: https://patch.msgid.link/20250512092736.229935-11-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice_xmit.h | 3 +++ net/sched/act_mirred.c | 28 +++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 38325e070296..848735b3a7c0 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -8,6 +8,9 @@ struct netdev_xmit { #ifdef CONFIG_NET_EGRESS u8 skip_txqueue; #endif +#if IS_ENABLED(CONFIG_NET_ACT_MIRRED) + u8 sched_mirred_nest; +#endif }; #endif diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5b3814365924..5f01f567c934 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_nest_level); + +#ifndef CONFIG_PREEMPT_RT +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); +} + +static void tcf_mirred_nest_level_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); +} + +#else +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return current->net_xmit.sched_mirred_nest++; +} + +static void tcf_mirred_nest_level_dec(void) +{ + current->net_xmit.sched_mirred_nest--; +} +#endif static bool tcf_mirred_is_act_redirect(int action) { @@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, int m_eaction; u32 blockid; - nest_level = __this_cpu_inc_return(mirred_nest_level); + nest_level = tcf_mirred_nest_level_inc_return(); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); @@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, retval); dec_nest_level: - __this_cpu_dec(mirred_nest_level); + tcf_mirred_nest_level_dec(); return retval; } From 20d677d389e7df6963ca9a41cd0f88954a65ba7b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:32 +0200 Subject: [PATCH 11/15] net/sched: Use nested-BH locking for sch_frag_data_storage sch_frag_data_storage is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add local_lock_t to the struct and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. 
Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-12-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/sched/sch_frag.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c index ce63414185fd..d1d87dce7f3f 100644 --- a/net/sched/sch_frag.c +++ b/net/sched/sch_frag.c @@ -16,14 +16,18 @@ struct sch_frag_data { unsigned int l2_len; u8 l2_data[VLAN_ETH_HLEN]; int (*xmit)(struct sk_buff *skb); + local_lock_t bh_lock; }; -static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage); +static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage); + lockdep_assert_held(&data->bh_lock); if (skb_cow_head(skb, data->l2_len) < 0) { kfree_skb(skb); return -ENOMEM; @@ -95,6 +99,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, struct rtable sch_frag_rt = { 0 }; unsigned long orig_dst; + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); @@ -105,11 +110,13 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, IPCB(skb)->frag_max_size = mru; ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) { unsigned long orig_dst; struct rt6_info sch_frag_rt; + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); memset(&sch_frag_rt, 0, sizeof(sch_frag_rt)); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, @@ -122,6 +129,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else { net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n", From 82d9e6b9a0a164719a8df7584d9f7e42de443698 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:33 +0200 Subject: [PATCH 12/15] mptcp: Use nested-BH locking for hmac_storage mptcp_delegated_actions is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. 
Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Cc: mptcp@lists.linux.dev Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-13-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/mptcp/protocol.c | 4 +++- net/mptcp/protocol.h | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c4fd558307f2..0749733ea897 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -46,7 +46,9 @@ static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_sm static void __mptcp_destroy_sock(struct sock *sk); static void mptcp_check_send_data_fin(struct sock *sk); -DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); +DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static struct net_device *mptcp_napi_dev; /* Returns end sequence number of the receiver's advertised window */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7aa38d74fef6..3dd11dd3ba16 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -479,6 +479,7 @@ mptcp_subflow_rsk(const struct request_sock *rsk) struct mptcp_delegated_action { struct napi_struct napi; + local_lock_t bh_lock; struct list_head head; }; @@ -670,9 +671,11 @@ static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); delegated = this_cpu_ptr(&mptcp_delegated_actions); schedule = list_empty(&delegated->head); list_add_tail(&subflow->delegated_node, &delegated->head); + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); sock_hold(mptcp_subflow_tcp_sock(subflow)); if (schedule) napi_schedule(&delegated->napi); @@ -684,11 +687,15 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) { struct mptcp_subflow_context *ret; - if (list_empty(&delegated->head)) + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); + if (list_empty(&delegated->head)) { + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return NULL; + } ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); list_del_init(&ret->delegated_node); + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return ret; } From aaaaa6639cf56eac4c5e58981e7d1b279ba9f4c9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:34 +0200 Subject: [PATCH 13/15] rds: Disable only bottom halves in rds_page_remainder_alloc() rds_page_remainder_alloc() is invoked from a preemptible context or a tasklet. There is no need to disable interrupts for locking. Use local_bh_disable() instead of local_irq_save() for locking. 
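The reasoning above is worth spelling out: the per-CPU remainder is only touched from process context and from a tasklet, and tasklets run in softirq context, so disabling bottom halves on the local CPU keeps the tasklet away and also keeps the task from migrating; there is no hard-IRQ user, so interrupts need not be disabled. A minimal sketch of that locking rule with hypothetical names (example_rem, example_update); note that on PREEMPT_RT this alone is not sufficient, which is why the final patch of the series adds an explicit nested-BH lock:

#include <linux/bottom_half.h>
#include <linux/percpu.h>

struct example_rem {
	unsigned long count;
};

static DEFINE_PER_CPU(struct example_rem, example_rems);

/* Called from process context; the same data is also updated from a
 * tasklet.  local_bh_disable() blocks softirqs (and thus the tasklet)
 * on this CPU and pins the task, so this_cpu_ptr() stays valid.
 */
static void example_update(void)
{
	struct example_rem *rem;

	local_bh_disable();
	rem = this_cpu_ptr(&example_rems);
	rem->count++;
	local_bh_enable();
}

Taking the per-CPU pointer inside the BH-disabled section, instead of with get_cpu()/put_cpu() beforehand, is exactly what the follow-up patch does for rds_page_remainders.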
Cc: Allison Henderson Cc: linux-rdma@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-14-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/rds/page.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/rds/page.c b/net/rds/page.c index 7cc57e098ddb..e0dd4f62ea47 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -69,7 +69,6 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp) { struct rds_page_remainder *rem; - unsigned long flags; struct page *page; int ret; @@ -88,7 +87,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, } rem = &per_cpu(rds_page_remainders, get_cpu()); - local_irq_save(flags); + local_bh_disable(); while (1) { /* avoid a tiny region getting stuck by tossing it */ @@ -116,13 +115,13 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, } /* alloc if there is nothing for us to use */ - local_irq_restore(flags); + local_bh_enable(); put_cpu(); page = alloc_page(gfp); rem = &per_cpu(rds_page_remainders, get_cpu()); - local_irq_save(flags); + local_bh_disable(); if (!page) { ret = -ENOMEM; @@ -140,7 +139,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, rem->r_offset = 0; } - local_irq_restore(flags); + local_bh_enable(); put_cpu(); out: rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, From 0af5928f358c40c1fe5ede79f66f040e23124044 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:35 +0200 Subject: [PATCH 14/15] rds: Acquire per-CPU pointer within BH disabled section rds_page_remainder_alloc() obtains the current CPU with get_cpu() while disabling preemption. Then the CPU number is used to access the per-CPU data structure via per_cpu(). This can be optimized by relying on local_bh_disable() to provide a stable CPU number/ avoid migration and then using this_cpu_ptr() to retrieve the data structure. Cc: Allison Henderson Cc: linux-rdma@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-15-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/rds/page.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/rds/page.c b/net/rds/page.c index e0dd4f62ea47..58a8548a915a 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -86,8 +86,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, goto out; } - rem = &per_cpu(rds_page_remainders, get_cpu()); local_bh_disable(); + rem = this_cpu_ptr(&rds_page_remainders); while (1) { /* avoid a tiny region getting stuck by tossing it */ @@ -116,12 +116,11 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, /* alloc if there is nothing for us to use */ local_bh_enable(); - put_cpu(); page = alloc_page(gfp); - rem = &per_cpu(rds_page_remainders, get_cpu()); local_bh_disable(); + rem = this_cpu_ptr(&rds_page_remainders); if (!page) { ret = -ENOMEM; @@ -140,7 +139,6 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, } local_bh_enable(); - put_cpu(); out: rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, ret ? NULL : sg_page(scat), ret ? 
0 : scat->offset, From c50d295c37f2648a8d9e8a572fedaad027d134bb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:36 +0200 Subject: [PATCH 15/15] rds: Use nested-BH locking for rds_page_remainder rds_page_remainder is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Cc: Allison Henderson Cc: linux-rdma@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-16-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- net/rds/page.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/rds/page.c b/net/rds/page.c index 58a8548a915a..afb151eac271 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -40,10 +40,12 @@ struct rds_page_remainder { struct page *r_page; unsigned long r_offset; + local_lock_t bh_lock; }; -static -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /** * rds_page_remainder_alloc - build up regions of a message. @@ -87,6 +89,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, } local_bh_disable(); + local_lock_nested_bh(&rds_page_remainders.bh_lock); rem = this_cpu_ptr(&rds_page_remainders); while (1) { @@ -115,11 +118,13 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, } /* alloc if there is nothing for us to use */ + local_unlock_nested_bh(&rds_page_remainders.bh_lock); local_bh_enable(); page = alloc_page(gfp); local_bh_disable(); + local_lock_nested_bh(&rds_page_remainders.bh_lock); rem = this_cpu_ptr(&rds_page_remainders); if (!page) { @@ -138,6 +143,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, rem->r_offset = 0; } + local_unlock_nested_bh(&rds_page_remainders.bh_lock); local_bh_enable(); out: rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,