Merge branch 'locking-introduce-nested-bh-locking'

Sebastian Andrzej Siewior says:

====================
locking: Introduce nested-BH locking.

Disabling bottom halves acts as a per-CPU BKL. On PREEMPT_RT, code within
a local_bh_disable() section remains preemptible. As a result, high-priority
tasks (or threaded interrupts) can be blocked by long-running lower-priority
tasks (or threaded interrupts), which includes softirq sections.

The proposed way out is to introduce explicit per-CPU locks for resources
that are protected by local_bh_disable() and to use them only on PREEMPT_RT,
so there is no additional overhead for !PREEMPT_RT builds.
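
As a usage sketch (all names below are made up for illustration and are
not part of this series), a per-CPU resource that so far relied on
local_bh_disable() alone gains an explicit nested-BH lock roughly like
this:

  struct demo_cache {
          local_lock_t    bh_lock;
          unsigned int    count;  /* previously guarded only by BH */
  };

  static DEFINE_PER_CPU(struct demo_cache, demo_cache) = {
          .bh_lock = INIT_LOCAL_LOCK(bh_lock),
  };

  static void demo_cache_update(void)
  {
          struct demo_cache *c;

          /* Caller runs in softirq/NAPI context with BH disabled.
           * On !PREEMPT_RT this only adds lockdep annotations, on
           * PREEMPT_RT it takes a per-CPU spinlock.
           */
          local_lock_nested_bh(&demo_cache.bh_lock);
          c = this_cpu_ptr(&demo_cache);
          c->count++;
          local_unlock_nested_bh(&demo_cache.bh_lock);
  }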

The series introduces the infrastructure and converts large parts of
networking, which is the largest stakeholder here. Once this is done, the
per-CPU lock from local_bh_disable() on PREEMPT_RT can be lifted.

Performance testing. Baseline is net-next as of commit 93bda33046
("Merge branch 'net-constify-ctl_table-arguments-of-utility-functions'")
plus v6.10-rc1. A 10 Gbit link is used between two hosts. The command
   xdp-bench redirect-cpu --cpu 3 --remote-action drop eth1 -e

was invoked on the receiving side with an ixgbe NIC. The sending side uses
pktgen_sample03_burst_single_flow.sh on an i40e.

Baseline:
| eth1->?                 9,018,604 rx/s                  0 err,drop/s
|   receive total         9,018,604 pkt/s                 0 drop/s                0 error/s
|     cpu:7               9,018,604 pkt/s                 0 drop/s                0 error/s
|   enqueue to cpu 3      9,018,602 pkt/s                 0 drop/s             7.00 bulk-avg
|     cpu:7->3            9,018,602 pkt/s                 0 drop/s             7.00 bulk-avg
|   kthread total         9,018,606 pkt/s                 0 drop/s          214,698 sched
|     cpu:3               9,018,606 pkt/s                 0 drop/s          214,698 sched
|     xdp_stats                   0 pass/s        9,018,606 drop/s                0 redir/s
|       cpu:3                     0 pass/s        9,018,606 drop/s                0 redir/s
|   redirect_err                  0 error/s
|   xdp_exception                 0 hit/s

perf top --sort cpu,symbol --no-children:
|   18.14%  007  [k] bpf_prog_4f0ffbb35139c187_cpumap_l4_hash
|   13.29%  007  [k] ixgbe_poll
|   12.66%  003  [k] cpu_map_kthread_run
|    7.23%  003  [k] page_frag_free
|    6.76%  007  [k] xdp_do_redirect
|    3.76%  007  [k] cpu_map_redirect
|    3.13%  007  [k] bq_flush_to_queue
|    2.51%  003  [k] xdp_return_frame
|    1.93%  007  [k] try_to_wake_up
|    1.78%  007  [k] _raw_spin_lock
|    1.74%  007  [k] cpu_map_enqueue
|    1.56%  003  [k] bpf_prog_57cd311f2e27366b_cpumap_drop

With this series applied:
| eth1->?                10,329,340 rx/s                  0 err,drop/s
|   receive total        10,329,340 pkt/s                 0 drop/s                0 error/s
|     cpu:6              10,329,340 pkt/s                 0 drop/s                0 error/s
|   enqueue to cpu 3     10,329,338 pkt/s                 0 drop/s             8.00 bulk-avg
|     cpu:6->3           10,329,338 pkt/s                 0 drop/s             8.00 bulk-avg
|   kthread total        10,329,321 pkt/s                 0 drop/s           96,297 sched
|     cpu:3              10,329,321 pkt/s                 0 drop/s           96,297 sched
|     xdp_stats                   0 pass/s       10,329,321 drop/s                0 redir/s
|       cpu:3                     0 pass/s       10,329,321 drop/s                0 redir/s
|   redirect_err                  0 error/s
|   xdp_exception                 0 hit/s

perf top --sort cpu,symbol --no-children:
|   20.90%  006  [k] bpf_prog_4f0ffbb35139c187_cpumap_l4_hash
|   12.62%  006  [k] ixgbe_poll
|    9.82%  003  [k] page_frag_free
|    8.73%  003  [k] cpu_map_bpf_prog_run_xdp
|    6.63%  006  [k] xdp_do_redirect
|    4.94%  003  [k] cpu_map_kthread_run
|    4.28%  006  [k] cpu_map_redirect
|    4.03%  006  [k] bq_flush_to_queue
|    3.01%  003  [k] xdp_return_frame
|    1.95%  006  [k] _raw_spin_lock
|    1.94%  003  [k] bpf_prog_57cd311f2e27366b_cpumap_drop

The difference in the perf output appears to be noise.

v8: https://lore.kernel.org/all/20240619072253.504963-1-bigeasy@linutronix.de
v7: https://lore.kernel.org/all/20240618072526.379909-1-bigeasy@linutronix.de
v6: https://lore.kernel.org/all/20240612170303.3896084-1-bigeasy@linutronix.de
v5: https://lore.kernel.org/all/20240607070427.1379327-1-bigeasy@linutronix.de
v4: https://lore.kernel.org/all/20240604154425.878636-1-bigeasy@linutronix.de
v3: https://lore.kernel.org/all/20240529162927.403425-1-bigeasy@linutronix.de
v2: https://lore.kernel.org/all/20240503182957.1042122-1-bigeasy@linutronix.de
v1: https://lore.kernel.org/all/20231215171020.687342-1-bigeasy@linutronix.de
====================

Link: https://patch.msgid.link/20240620132727.660738-1-bigeasy@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
24 changed files with 425 additions and 131 deletions

View File

@@ -733,21 +733,101 @@ struct bpf_nh_params {
};
};
/* flags for bpf_redirect_info kern_flags */
#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */
#define BPF_RI_F_RI_INIT BIT(1)
#define BPF_RI_F_CPU_MAP_INIT BIT(2)
#define BPF_RI_F_DEV_MAP_INIT BIT(3)
#define BPF_RI_F_XSK_MAP_INIT BIT(4)
struct bpf_redirect_info {
u64 tgt_index;
void *tgt_value;
struct bpf_map *map;
u32 flags;
u32 kern_flags;
u32 map_id;
enum bpf_map_type map_type;
struct bpf_nh_params nh;
u32 kern_flags;
};
DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
struct bpf_net_context {
struct bpf_redirect_info ri;
struct list_head cpu_map_flush_list;
struct list_head dev_map_flush_list;
struct list_head xskmap_map_flush_list;
};
/* flags for bpf_redirect_info kern_flags */
#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */
static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx)
{
struct task_struct *tsk = current;
if (tsk->bpf_net_context != NULL)
return NULL;
bpf_net_ctx->ri.kern_flags = 0;
tsk->bpf_net_context = bpf_net_ctx;
return bpf_net_ctx;
}
static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx)
{
if (bpf_net_ctx)
current->bpf_net_context = NULL;
}
static inline struct bpf_net_context *bpf_net_ctx_get(void)
{
return current->bpf_net_context;
}
static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void)
{
struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) {
memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh));
bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT;
}
return &bpf_net_ctx->ri;
}
static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void)
{
struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)) {
INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list);
bpf_net_ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT;
}
return &bpf_net_ctx->cpu_map_flush_list;
}
static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void)
{
struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)) {
INIT_LIST_HEAD(&bpf_net_ctx->dev_map_flush_list);
bpf_net_ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT;
}
return &bpf_net_ctx->dev_map_flush_list;
}
static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void)
{
struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)) {
INIT_LIST_HEAD(&bpf_net_ctx->xskmap_map_flush_list);
bpf_net_ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT;
}
return &bpf_net_ctx->xskmap_map_flush_list;
}
/* Compute the linear packet data range [data, data_end) which
* will be accessed by various program types (cls_bpf, act_bpf,
@@ -1018,25 +1098,23 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len);
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
void bpf_clear_redirect_map(struct bpf_map *map);
static inline bool xdp_return_frame_no_direct(void)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
}
static inline void xdp_set_return_frame_no_direct(void)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
}
static inline void xdp_clear_return_frame_no_direct(void)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
}
@@ -1592,7 +1670,7 @@ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 inde
u64 flags, const u64 flag_mask,
void *lookup_elem(struct bpf_map *map, u32 key))
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
/* Lower bits of the flags are used as return code on lookup failure */
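
For orientation, the callers converted later in this series (e.g.
net_rx_action(), cpu_map_bpf_prog_run(), xdp_test_run_batch()) wrap the
section that may run BPF programs roughly like this; a condensed sketch,
not a verbatim copy of the diff:

  struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;

  local_bh_disable();
  /* Provide on-stack storage for bpf_redirect_info and the flush lists.
   * Returns NULL if an outer caller already installed a context;
   * bpf_net_ctx_clear(NULL) is a no-op in that case.
   */
  bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

  /* ... run XDP/BPF programs, redirects, flushes ... */

  bpf_net_ctx_clear(bpf_net_ctx);
  local_bh_enable();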

View File

@@ -51,4 +51,25 @@
#define local_unlock_irqrestore(lock, flags) \
__local_unlock_irqrestore(lock, flags)
DEFINE_GUARD(local_lock, local_lock_t __percpu*,
local_lock(_T),
local_unlock(_T))
DEFINE_GUARD(local_lock_irq, local_lock_t __percpu*,
local_lock_irq(_T),
local_unlock_irq(_T))
DEFINE_LOCK_GUARD_1(local_lock_irqsave, local_lock_t __percpu,
local_lock_irqsave(_T->lock, _T->flags),
local_unlock_irqrestore(_T->lock, _T->flags),
unsigned long flags)
#define local_lock_nested_bh(_lock) \
__local_lock_nested_bh(_lock)
#define local_unlock_nested_bh(_lock) \
__local_unlock_nested_bh(_lock)
DEFINE_GUARD(local_lock_nested_bh, local_lock_t __percpu*,
local_lock_nested_bh(_T),
local_unlock_nested_bh(_T))
#endif
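
The DEFINE_GUARD() above also hooks local_lock_nested_bh() into the
scope-based cleanup infrastructure from <linux/cleanup.h>. A hedged
sketch; demo_stats is a made-up per-CPU struct with a local_lock_t
bh_lock member, initialised like the other conversions in this series:

  static void demo_stats_inc(void)
  {
          /* the lock is dropped automatically when the scope is left */
          guard(local_lock_nested_bh)(&demo_stats.bh_lock);
          this_cpu_inc(demo_stats.count);
  }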

View File

@@ -62,6 +62,17 @@ do { \
local_lock_debug_init(lock); \
} while (0)
#define __spinlock_nested_bh_init(lock) \
do { \
static struct lock_class_key __key; \
\
debug_check_no_locks_freed((void *)lock, sizeof(*lock));\
lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \
0, LD_WAIT_CONFIG, LD_WAIT_INV, \
LD_LOCK_NORMAL); \
local_lock_debug_init(lock); \
} while (0)
#define __local_lock(lock) \
do { \
preempt_disable(); \
@@ -98,6 +109,15 @@ do { \
local_irq_restore(flags); \
} while (0)
#define __local_lock_nested_bh(lock) \
do { \
lockdep_assert_in_softirq(); \
local_lock_acquire(this_cpu_ptr(lock)); \
} while (0)
#define __local_unlock_nested_bh(lock) \
local_lock_release(this_cpu_ptr(lock))
#else /* !CONFIG_PREEMPT_RT */
/*
@@ -138,4 +158,15 @@ typedef spinlock_t local_lock_t;
#define __local_unlock_irqrestore(lock, flags) __local_unlock(lock)
#define __local_lock_nested_bh(lock) \
do { \
lockdep_assert_in_softirq_func(); \
spin_lock(this_cpu_ptr(lock)); \
} while (0)
#define __local_unlock_nested_bh(lock) \
do { \
spin_unlock(this_cpu_ptr((lock))); \
} while (0)
#endif /* CONFIG_PREEMPT_RT */

View File

@@ -600,6 +600,8 @@ do { \
(!in_softirq() || in_irq() || in_nmi())); \
} while (0)
extern void lockdep_assert_in_softirq_func(void);
#else
# define might_lock(lock) do { } while (0)
# define might_lock_read(lock) do { } while (0)
@@ -613,6 +615,7 @@ do { \
# define lockdep_assert_preemption_enabled() do { } while (0)
# define lockdep_assert_preemption_disabled() do { } while (0)
# define lockdep_assert_in_softirq() do { } while (0)
# define lockdep_assert_in_softirq_func() do { } while (0)
#endif
#ifdef CONFIG_PROVE_RAW_LOCK_NESTING

View File

@@ -43,6 +43,7 @@
#include <linux/netdev_features.h>
#include <linux/neighbour.h>
#include <linux/netdevice_xmit.h>
#include <uapi/linux/netdevice.h>
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
@@ -3201,6 +3202,7 @@ static inline bool dev_has_header(const struct net_device *dev)
struct softnet_data {
struct list_head poll_list;
struct sk_buff_head process_queue;
local_lock_t process_queue_bh_lock;
/* stats */
unsigned int processed;
@@ -3223,13 +3225,7 @@ struct softnet_data {
struct sk_buff_head xfrm_backlog;
#endif
/* written and read only by owning cpu: */
struct {
u16 recursion;
u8 more;
#ifdef CONFIG_NET_EGRESS
u8 skip_txqueue;
#endif
} xmit;
struct netdev_xmit xmit;
#ifdef CONFIG_RPS
/* input_queue_head should be written by cpu owning this struct,
* and only read by other cpus. Worth using a cache line.
@@ -3257,10 +3253,18 @@ struct softnet_data {
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
#ifndef CONFIG_PREEMPT_RT
static inline int dev_recursion_level(void)
{
return this_cpu_read(softnet_data.xmit.recursion);
}
#else
static inline int dev_recursion_level(void)
{
return current->net_xmit.recursion;
}
#endif
void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);
@@ -4872,18 +4876,35 @@ static inline ktime_t netdev_get_tstamp(struct net_device *dev,
return hwtstamps->hwtstamp;
}
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
struct sk_buff *skb, struct net_device *dev,
bool more)
#ifndef CONFIG_PREEMPT_RT
static inline void netdev_xmit_set_more(bool more)
{
__this_cpu_write(softnet_data.xmit.more, more);
return ops->ndo_start_xmit(skb, dev);
}
static inline bool netdev_xmit_more(void)
{
return __this_cpu_read(softnet_data.xmit.more);
}
#else
static inline void netdev_xmit_set_more(bool more)
{
current->net_xmit.more = more;
}
static inline bool netdev_xmit_more(void)
{
return current->net_xmit.more;
}
#endif
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
struct sk_buff *skb, struct net_device *dev,
bool more)
{
netdev_xmit_set_more(more);
return ops->ndo_start_xmit(skb, dev);
}
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, bool more)

View File

@@ -0,0 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_NETDEVICE_XMIT_H
#define _LINUX_NETDEVICE_XMIT_H
struct netdev_xmit {
u16 recursion;
u8 more;
#ifdef CONFIG_NET_EGRESS
u8 skip_txqueue;
#endif
};
#endif

View File

@@ -36,6 +36,7 @@
#include <linux/signal_types.h>
#include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
#include <linux/netdevice_xmit.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
@@ -53,6 +54,7 @@ struct bio_list;
struct blk_plug;
struct bpf_local_storage;
struct bpf_run_ctx;
struct bpf_net_context;
struct capture_control;
struct cfs_rq;
struct fs_struct;
@@ -975,7 +977,9 @@ struct task_struct {
/* delay due to memory thrashing */
unsigned in_thrashing:1;
#endif
#ifdef CONFIG_PREEMPT_RT
struct netdev_xmit net_xmit;
#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
struct restart_block restart_block;
@@ -1506,6 +1510,8 @@ struct task_struct {
/* Used for BPF run context */
struct bpf_run_ctx *bpf_ctx;
#endif
/* Used by BPF for per-TASK xdp storage */
struct bpf_net_context *bpf_net_context;
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
unsigned long lowest_stack;

View File

@@ -19,6 +19,7 @@ extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb);
struct seg6_bpf_srh_state {
local_lock_t bh_lock;
struct ipv6_sr_hdr *srh;
u16 hdrlen;
bool valid;

View File

@@ -544,6 +544,11 @@ struct sock {
netns_tracker ns_tracker;
};
struct sock_bh_locked {
struct sock *sock;
local_lock_t bh_lock;
};
enum sk_pacing {
SK_PACING_NONE = 0,
SK_PACING_NEEDED = 1,

View File

@@ -79,8 +79,6 @@ struct bpf_cpu_map {
struct bpf_cpu_map_entry __rcu **cpu_map;
};
static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
@@ -240,12 +238,14 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
int xdp_n, struct xdp_cpumap_stats *stats,
struct list_head *list)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int nframes;
if (!rcpu->prog)
return xdp_n;
rcu_read_lock_bh();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
@@ -255,6 +255,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
if (unlikely(!list_empty(list)))
cpu_map_bpf_prog_run_skb(rcpu, list, stats);
bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
return nframes;
@@ -706,7 +707,7 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
@@ -758,7 +759,7 @@ int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
void __cpu_map_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -772,20 +773,9 @@ void __cpu_map_flush(void)
#ifdef CONFIG_DEBUG_NET
bool cpu_map_check_flush(void)
{
if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
if (list_empty(bpf_net_ctx_get_cpu_map_flush_list()))
return false;
__cpu_map_flush();
return true;
}
#endif
static int __init cpu_map_init(void)
{
int cpu;
for_each_possible_cpu(cpu)
INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
return 0;
}
subsys_initcall(cpu_map_init);

View File

@@ -83,7 +83,6 @@ struct bpf_dtab {
u32 n_buckets;
};
static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -196,7 +195,14 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
bpf_clear_redirect_map(map);
/* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
* during NAPI callback and cleared after the XDP redirect. There is no
* explicit RCU read section which protects bpf_redirect_info->map but
* local_bh_disable() also marks the beginning an RCU section. This
* makes the complete softirq callback RCU protected. Thus after
* following synchronize_rcu() there no bpf_redirect_info->map == map
* assignment.
*/
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -408,7 +414,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
*/
void __dev_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -422,7 +428,7 @@ void __dev_flush(void)
#ifdef CONFIG_DEBUG_NET
bool dev_check_flush(void)
{
if (list_empty(this_cpu_ptr(&dev_flush_list)))
if (list_empty(bpf_net_ctx_get_dev_flush_list()))
return false;
__dev_flush();
return true;
@@ -453,7 +459,7 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
@@ -1153,15 +1159,11 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
int cpu;
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
for_each_possible_cpu(cpu)
INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}

View File

@@ -2355,6 +2355,7 @@ __latent_entropy struct task_struct *copy_process(
RCU_INIT_POINTER(p->bpf_storage, NULL);
p->bpf_ctx = NULL;
#endif
p->bpf_net_context = NULL;
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);

View File

@@ -413,3 +413,11 @@ notrace int in_lock_functions(unsigned long addr)
&& addr < (unsigned long)__lock_text_end;
}
EXPORT_SYMBOL(in_lock_functions);
#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_PREEMPT_RT)
void notrace lockdep_assert_in_softirq_func(void)
{
lockdep_assert_in_softirq();
}
EXPORT_SYMBOL(lockdep_assert_in_softirq_func);
#endif

View File

@@ -283,9 +283,10 @@ static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
u32 repeat)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int err = 0, act, ret, i, nframes = 0, batch_sz;
struct xdp_frame **frames = xdp->frames;
struct bpf_redirect_info *ri;
struct xdp_page_head *head;
struct xdp_frame *frm;
bool redirect = false;
@@ -295,6 +296,8 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
batch_sz = min_t(u32, repeat, xdp->batch_size);
local_bh_disable();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
ri = bpf_net_ctx_get_ri();
xdp_set_return_frame_no_direct();
for (i = 0; i < batch_sz; i++) {
@@ -359,6 +362,7 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
}
xdp_clear_return_frame_no_direct();
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
return err;
}
@@ -394,6 +398,7 @@ static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx,
static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
u32 *retval, u32 *time, bool xdp)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
struct bpf_prog_array_item item = {.prog = prog};
struct bpf_run_ctx *old_ctx;
struct bpf_cg_run_ctx run_ctx;
@@ -419,10 +424,14 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
do {
run_ctx.prog_item = &item;
local_bh_disable();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (xdp)
*retval = bpf_prog_run_xdp(prog, ctx);
else
*retval = bpf_prog_run(prog, ctx);
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
} while (bpf_test_timer_continue(&t, 1, repeat, &ret, time));
bpf_reset_run_ctx(old_ctx);

View File

@@ -137,6 +137,7 @@ static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
struct brnf_frag_data {
local_lock_t bh_lock;
char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
u8 encap_size;
u8 size;
@@ -144,7 +145,9 @@ struct brnf_frag_data {
__be16 vlan_proto;
};
static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage);
static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
static void nf_bridge_info_free(struct sk_buff *skb)
{
@@ -850,6 +853,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
unsigned int mtu, mtu_reserved;
int ret;
mtu_reserved = nf_bridge_mtu_reduction(skb);
mtu = skb->dev->mtu;
@@ -882,6 +886,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
data = this_cpu_ptr(&brnf_frag_data_storage);
if (skb_vlan_tag_present(skb)) {
@@ -897,7 +902,9 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
data->size);
return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
return ret;
}
if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) &&
skb->protocol == htons(ETH_P_IPV6)) {
@@ -909,6 +916,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
data = this_cpu_ptr(&brnf_frag_data_storage);
data->encap_size = nf_bridge_encap_header_len(skb);
data->size = ETH_HLEN + data->encap_size;
@@ -916,8 +924,12 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
data->size);
if (v6ops)
return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
if (v6ops) {
ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
return ret;
}
local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
kfree_skb(skb);
return -EMSGSIZE;

View File

@@ -229,7 +229,7 @@ static inline void backlog_lock_irq_save(struct softnet_data *sd,
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
else
local_irq_save(*flags);
}
@@ -237,7 +237,7 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd)
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irq(&sd->input_pkt_queue.lock);
else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
else
local_irq_disable();
}
@@ -246,7 +246,7 @@ static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
else
local_irq_restore(*flags);
}
@@ -254,7 +254,7 @@ static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irq(&sd->input_pkt_queue.lock);
else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
else
local_irq_enable();
}
@@ -449,7 +449,9 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
* queue in the local softnet handler.
*/
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
.process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
};
EXPORT_PER_CPU_SYMBOL(softnet_data);
/* Page_pool has a lockless array/stack to alloc/recycle pages.
@@ -3940,6 +3942,7 @@ netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
}
#ifndef CONFIG_PREEMPT_RT
static bool netdev_xmit_txqueue_skipped(void)
{
return __this_cpu_read(softnet_data.xmit.skip_txqueue);
@@ -3950,6 +3953,19 @@ void netdev_xmit_skip_txqueue(bool skip)
__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#else
static bool netdev_xmit_txqueue_skipped(void)
{
return current->net_xmit.skip_txqueue;
}
void netdev_xmit_skip_txqueue(bool skip)
{
current->net_xmit.skip_txqueue = skip;
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#endif
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_NET_XGRESS
@@ -4029,10 +4045,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int sch_ret;
if (!entry)
return skb;
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
@@ -4061,10 +4080,12 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
break;
}
*ret = NET_RX_SUCCESS;
bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_RX_DROP;
bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
/* used by tc_run */
case TC_ACT_STOLEN:
@@ -4074,8 +4095,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
fallthrough;
case TC_ACT_CONSUMED:
*ret = NET_RX_SUCCESS;
bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
}
bpf_net_ctx_clear(bpf_net_ctx);
return skb;
}
@@ -4085,11 +4108,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int sch_ret;
if (!entry)
return skb;
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
* already set by the caller.
*/
@@ -4105,10 +4131,12 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_XMIT_DROP;
bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
/* used by tc_run */
case TC_ACT_STOLEN:
@@ -4118,8 +4146,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
fallthrough;
case TC_ACT_CONSUMED:
*ret = NET_XMIT_SUCCESS;
bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
}
bpf_net_ctx_clear(bpf_net_ctx);
return skb;
}
@@ -5935,6 +5965,7 @@ static void flush_backlog(struct work_struct *work)
}
backlog_unlock_irq_enable(sd);
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
@@ -5942,6 +5973,7 @@ static void flush_backlog(struct work_struct *work)
rps_input_queue_head_incr(sd);
}
}
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
local_bh_enable();
}
@@ -6063,7 +6095,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
while (again) {
struct sk_buff *skb;
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
while ((skb = __skb_dequeue(&sd->process_queue))) {
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
rcu_read_lock();
__netif_receive_skb(skb);
rcu_read_unlock();
@@ -6072,7 +6106,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
return work;
}
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
}
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
backlog_lock_irq_disable(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
@@ -6087,8 +6123,10 @@ static int process_backlog(struct napi_struct *napi, int quota)
napi->state &= NAPIF_STATE_THREADED;
again = false;
} else {
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
}
backlog_unlock_irq_enable(sd);
}
@@ -6301,6 +6339,7 @@ enum {
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
unsigned flags, u16 budget)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
bool skip_schedule = false;
unsigned long timeout;
int rc;
@@ -6318,6 +6357,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
local_bh_disable();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (flags & NAPI_F_PREFER_BUSY_POLL) {
napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
@@ -6340,6 +6380,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
netpoll_poll_unlock(have_poll_lock);
if (rc == budget)
__busy_poll_stop(napi, skip_schedule);
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
}
@@ -6349,6 +6390,7 @@ static void __napi_busy_loop(unsigned int napi_id,
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
void *have_poll_lock = NULL;
struct napi_struct *napi;
@@ -6367,6 +6409,7 @@ static void __napi_busy_loop(unsigned int napi_id,
int work = 0;
local_bh_disable();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (!napi_poll) {
unsigned long val = READ_ONCE(napi->state);
@@ -6397,6 +6440,7 @@ static void __napi_busy_loop(unsigned int napi_id,
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
skb_defer_free_flush(this_cpu_ptr(&softnet_data));
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
if (!loop_end || loop_end(loop_end_arg, start_time))
@@ -6824,6 +6868,7 @@ static int napi_thread_wait(struct napi_struct *napi)
static void napi_threaded_poll_loop(struct napi_struct *napi)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
struct softnet_data *sd;
unsigned long last_qs = jiffies;
@@ -6832,6 +6877,8 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
void *have;
local_bh_disable();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
sd = this_cpu_ptr(&softnet_data);
sd->in_napi_threaded_poll = true;
@@ -6847,6 +6894,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
net_rps_action_and_irq_enable(sd);
}
skb_defer_free_flush(sd);
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
if (!repoll)
@@ -6872,10 +6920,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int budget = READ_ONCE(net_hotdata.netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
start:
sd->in_net_rx_action = true;
local_irq_disable();
@@ -6928,7 +6978,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
sd->in_net_rx_action = false;
net_rps_action_and_irq_enable(sd);
end:;
end:
bpf_net_ctx_clear(bpf_net_ctx);
}
struct netdev_adjacent {

View File

@@ -150,6 +150,8 @@ struct napi_struct *napi_by_id(unsigned int napi_id);
void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
#define XMIT_RECURSION_LIMIT 8
#ifndef CONFIG_PREEMPT_RT
static inline bool dev_xmit_recursion(void)
{
return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
@@ -165,6 +167,22 @@ static inline void dev_xmit_recursion_dec(void)
{
__this_cpu_dec(softnet_data.xmit.recursion);
}
#else
static inline bool dev_xmit_recursion(void)
{
return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
}
static inline void dev_xmit_recursion_inc(void)
{
current->net_xmit.recursion++;
}
static inline void dev_xmit_recursion_dec(void)
{
current->net_xmit.recursion--;
}
#endif
int dev_set_hwtstamp_phylib(struct net_device *dev,
struct kernel_hwtstamp_config *cfg,
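
For context, the dev_xmit_recursion*() helpers above are used around
ndo_start_xmit() in __dev_queue_xmit(), roughly as follows (simplified
sketch, not part of this diff):

  if (dev_xmit_recursion())
          goto recursion_alert;

  /* The counter lives either in per-CPU softnet_data (!PREEMPT_RT)
   * or in current->net_xmit (PREEMPT_RT).
   */
  dev_xmit_recursion_inc();
  skb = dev_hard_start_xmit(skb, dev, txq, &rc);
  dev_xmit_recursion_dec();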

View File

@@ -1658,9 +1658,12 @@ struct bpf_scratchpad {
__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
u8 buff[MAX_BPF_STACK];
};
local_lock_t bh_lock;
};
static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
static inline int __bpf_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
@@ -2021,6 +2024,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
u32 diff_size = from_size + to_size;
int i, j = 0;
__wsum ret;
/* This is quite flexible, some examples:
*
@@ -2034,12 +2038,15 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
diff_size > sizeof(sp->diff)))
return -EINVAL;
local_lock_nested_bh(&bpf_sp.bh_lock);
for (i = 0; i < from_size / sizeof(__be32); i++, j++)
sp->diff[j] = ~from[i];
for (i = 0; i < to_size / sizeof(__be32); i++, j++)
sp->diff[j] = to[i];
return csum_partial(sp->diff, diff_size, seed);
ret = csum_partial(sp->diff, diff_size, seed);
local_unlock_nested_bh(&bpf_sp.bh_lock);
return ret;
}
static const struct bpf_func_proto bpf_csum_diff_proto = {
@@ -2476,9 +2483,6 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
.arg3_type = ARG_ANYTHING,
};
DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -2491,7 +2495,7 @@ static struct net_device *skb_get_peer_dev(struct net_device *dev)
int skb_do_redirect(struct sk_buff *skb)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct net *net = dev_net(skb->dev);
struct net_device *dev;
u32 flags = ri->flags;
@@ -2524,7 +2528,7 @@ int skb_do_redirect(struct sk_buff *skb)
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return TC_ACT_SHOT;
@@ -2545,7 +2549,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags))
return TC_ACT_SHOT;
@@ -2567,7 +2571,7 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {
BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
int, plen, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely((plen && plen < sizeof(*params)) || flags))
return TC_ACT_SHOT;
@@ -4293,30 +4297,13 @@ void xdp_do_check_flushed(struct napi_struct *napi)
}
#endif
void bpf_clear_redirect_map(struct bpf_map *map)
{
struct bpf_redirect_info *ri;
int cpu;
for_each_possible_cpu(cpu) {
ri = per_cpu_ptr(&bpf_redirect_info, cpu);
/* Avoid polluting remote cacheline due to writes if
* not needed. Once we pass this test, we need the
* cmpxchg() to make sure it hasn't been changed in
* the meantime by remote CPU.
*/
if (unlikely(READ_ONCE(ri->map) == map))
cmpxchg(&ri->map, map, NULL);
}
}
DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
u32 xdp_master_redirect(struct xdp_buff *xdp)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct net_device *master, *slave;
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
@@ -4388,7 +4375,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
map = READ_ONCE(ri->map);
/* The map pointer is cleared when the map is being torn
* down by bpf_clear_redirect_map()
* down by dev_map_free()
*/
if (unlikely(!map)) {
err = -ENOENT;
@@ -4433,7 +4420,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
if (map_type == BPF_MAP_TYPE_XSKMAP)
@@ -4447,7 +4434,7 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect);
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
if (map_type == BPF_MAP_TYPE_XSKMAP)
@@ -4464,7 +4451,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
enum bpf_map_type map_type, u32 map_id,
u32 flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct bpf_map *map;
int err;
@@ -4476,7 +4463,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
map = READ_ONCE(ri->map);
/* The map pointer is cleared when the map is being torn
* down by bpf_clear_redirect_map()
* down by dev_map_free()
*/
if (unlikely(!map)) {
err = -ENOENT;
@@ -4518,7 +4505,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
@@ -4554,7 +4541,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags))
return XDP_ABORTED;
@@ -6455,6 +6442,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
void *srh_tlvs, *srh_end, *ptr;
int srhoff = 0;
lockdep_assert_held(&srh_state->bh_lock);
if (srh == NULL)
return -EINVAL;
@@ -6511,6 +6499,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
int hdroff = 0;
int err;
lockdep_assert_held(&srh_state->bh_lock);
switch (action) {
case SEG6_LOCAL_ACTION_END_X:
if (!seg6_bpf_has_valid_srh(skb))
@@ -6587,6 +6576,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
int srhoff = 0;
int ret;
lockdep_assert_held(&srh_state->bh_lock);
if (unlikely(srh == NULL))
return -EINVAL;

View File

@@ -38,13 +38,14 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
struct dst_entry *dst, bool can_redirect)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int ret;
/* Migration disable and BH disable are needed to protect per-cpu
* redirect_info between BPF prog and skb_do_redirect().
/* Disabling BH is needed to protect per-CPU bpf_redirect_info between
* BPF prog and skb_do_redirect().
*/
migrate_disable();
local_bh_disable();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
@@ -77,8 +78,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
break;
}
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
migrate_enable();
return ret;
}

View File

@@ -277,6 +277,7 @@ static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
#endif
struct napi_alloc_cache {
local_lock_t bh_lock;
struct page_frag_cache page;
struct page_frag_1k page_small;
unsigned int skb_count;
@@ -284,7 +285,9 @@ struct napi_alloc_cache {
};
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
/* Double check that napi_get_frags() allocates skbs with
* skb->head being backed by slab, not a page fragment.
@@ -306,11 +309,16 @@ void napi_get_frags_check(struct napi_struct *napi)
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
void *data;
fragsz = SKB_DATA_ALIGN(fragsz);
return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
align_mask);
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
return data;
}
EXPORT_SYMBOL(__napi_alloc_frag_align);
@@ -318,19 +326,15 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
void *data;
fragsz = SKB_DATA_ALIGN(fragsz);
if (in_hardirq() || irqs_disabled()) {
struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
fragsz = SKB_DATA_ALIGN(fragsz);
data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
align_mask);
} else {
struct napi_alloc_cache *nc;
local_bh_disable();
nc = this_cpu_ptr(&napi_alloc_cache);
data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
align_mask);
data = __napi_alloc_frag_align(fragsz, align_mask);
local_bh_enable();
}
return data;
@@ -342,16 +346,20 @@ static struct sk_buff *napi_skb_cache_get(void)
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!nc->skb_count)) {
nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
GFP_ATOMIC,
NAPI_SKB_CACHE_BULK,
nc->skb_cache);
if (unlikely(!nc->skb_count))
if (unlikely(!nc->skb_count)) {
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
return NULL;
}
}
skb = nc->skb_cache[--nc->skb_count];
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
return skb;
@@ -744,9 +752,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
pfmemalloc = nc->pfmemalloc;
} else {
local_bh_disable();
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
}
@@ -810,11 +822,11 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
goto skb_success;
}
nc = this_cpu_ptr(&napi_alloc_cache);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc = this_cpu_ptr(&napi_alloc_cache);
if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
/* we are artificially inflating the allocation size, but
* that is not as bad as it may look like, as:
@@ -836,6 +848,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
data = page_frag_alloc(&nc->page, len, gfp_mask);
pfmemalloc = nc->page.pfmemalloc;
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!data))
return NULL;
@@ -1435,6 +1448,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
if (!kasan_mempool_poison_object(skb))
return;
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
@@ -1446,6 +1460,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
nc->skb_cache + NAPI_SKB_CACHE_HALF);
nc->skb_count = NAPI_SKB_CACHE_HALF;
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)

View File

@@ -93,7 +93,9 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
@@ -882,7 +884,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ctl_sk = this_cpu_read(ipv4_tcp_sk);
local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
sock_net_set(ctl_sk, net);
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -907,6 +911,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
local_bh_enable();
#ifdef CONFIG_TCP_MD5SIG
@@ -1002,7 +1007,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ctl_sk = this_cpu_read(ipv4_tcp_sk);
local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
sock_net_set(ctl_sk, net);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
@@ -1017,6 +1023,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
local_bh_enable();
}
@@ -3615,7 +3622,7 @@ void __init tcp_v4_init(void)
sk->sk_clockid = CLOCK_MONOTONIC;
per_cpu(ipv4_tcp_sk, cpu) = sk;
per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
}
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");

View File

@@ -10,7 +10,14 @@
#include <net/tcp.h>
static size_t __scratch_size;
static DEFINE_PER_CPU(void __rcu *, sigpool_scratch);
struct sigpool_scratch {
local_lock_t bh_lock;
void __rcu *pad;
};
static DEFINE_PER_CPU(struct sigpool_scratch, sigpool_scratch) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
struct sigpool_entry {
struct crypto_ahash *hash;
@@ -72,7 +79,7 @@ static int sigpool_reserve_scratch(size_t size)
break;
}
old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch, cpu),
old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
scratch, lockdep_is_held(&cpool_mutex));
if (!cpu_online(cpu) || !old_scratch) {
kfree(old_scratch);
@@ -93,7 +100,7 @@ static void sigpool_scratch_free(void)
int cpu;
for_each_possible_cpu(cpu)
kfree(rcu_replace_pointer(per_cpu(sigpool_scratch, cpu),
kfree(rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
NULL, lockdep_is_held(&cpool_mutex)));
__scratch_size = 0;
}
@@ -277,7 +284,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RC
/* Pairs with tcp_sigpool_reserve_scratch(), scratch area is
* valid (allocated) until tcp_sigpool_end().
*/
c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch));
local_lock_nested_bh(&sigpool_scratch.bh_lock);
c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch.pad));
return 0;
}
EXPORT_SYMBOL_GPL(tcp_sigpool_start);
@@ -286,6 +294,7 @@ void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH)
{
struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req);
local_unlock_nested_bh(&sigpool_scratch.bh_lock);
rcu_read_unlock_bh();
ahash_request_free(c->req);
crypto_free_ahash(hash);

View File

@@ -1380,7 +1380,9 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
return err;
}
DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
{
@@ -1388,6 +1390,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
this_cpu_ptr(&seg6_bpf_srh_states);
struct ipv6_sr_hdr *srh = srh_state->srh;
lockdep_assert_held(&srh_state->bh_lock);
if (unlikely(srh == NULL))
return false;
@@ -1408,8 +1411,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
static int input_action_end_bpf(struct sk_buff *skb,
struct seg6_local_lwt *slwt)
{
struct seg6_bpf_srh_state *srh_state =
this_cpu_ptr(&seg6_bpf_srh_states);
struct seg6_bpf_srh_state *srh_state;
struct ipv6_sr_hdr *srh;
int ret;
@@ -1420,10 +1422,14 @@ static int input_action_end_bpf(struct sk_buff *skb,
}
advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
/* preempt_disable is needed to protect the per-CPU buffer srh_state,
* which is also accessed by the bpf_lwt_seg6_* helpers
/* The access to the per-CPU buffer srh_state is protected by running
* always in softirq context (with disabled BH). On PREEMPT_RT the
* required locking is provided by the following local_lock_nested_bh()
* statement. It is also accessed by the bpf_lwt_seg6_* helpers via
* bpf_prog_run_save_cb().
*/
preempt_disable();
local_lock_nested_bh(&seg6_bpf_srh_states.bh_lock);
srh_state = this_cpu_ptr(&seg6_bpf_srh_states);
srh_state->srh = srh;
srh_state->hdrlen = srh->hdrlen << 3;
srh_state->valid = true;
@@ -1446,15 +1452,15 @@ static int input_action_end_bpf(struct sk_buff *skb,
if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
goto drop;
local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock);
preempt_enable();
if (ret != BPF_REDIRECT)
seg6_lookup_nexthop(skb, NULL, 0);
return dst_input(skb);
drop:
preempt_enable();
local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock);
kfree_skb(skb);
return -EINVAL;
}

View File

@@ -35,8 +35,6 @@
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -372,7 +370,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
int err;
err = xsk_rcv(xs, xdp);
@@ -387,7 +385,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
void __xsk_map_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
struct xdp_sock *xs, *tmp;
list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
@@ -399,7 +397,7 @@ void __xsk_map_flush(void)
#ifdef CONFIG_DEBUG_NET
bool xsk_map_check_flush(void)
{
if (list_empty(this_cpu_ptr(&xskmap_flush_list)))
if (list_empty(bpf_net_ctx_get_xskmap_flush_list()))
return false;
__xsk_map_flush();
return true;
@@ -1772,7 +1770,7 @@ static struct pernet_operations xsk_net_ops = {
static int __init xsk_init(void)
{
int err, cpu;
int err;
err = proto_register(&xsk_proto, 0 /* no slab */);
if (err)
@@ -1790,8 +1788,6 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
for_each_possible_cpu(cpu)
INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
return 0;
out_pernet: