Florian Westphal says:

====================
netfilter: updates for net-next

First patch gets rid of refcounting for dying list dumping,
using a cookie value instead of keeping the object around.
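
A sketch of the idea, condensed from the ctnetlink diff below: the
dump context stores an identity cookie instead of a refcounted
pointer, so a freed entry simply stops matching rather than being
pinned in memory until the dump ends.

	struct ctnetlink_list_dump_ctx {
		unsigned long last_id;	/* 0: no resume point recorded */
		...
	};

	/* when the skb fills up, remember where to resume: */
	if (res < 0)
		ctx->last_id = ctnetlink_get_id(ct);

	/* on the next dump pass, skip entries until the cookie matches: */
	if (ctx->last_id) {
		if (ctnetlink_get_id(ct) != ctx->last_id)
			return 0;
		ctx->last_id = 0;
	}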

Remaining patches extend the nftables pipapo (concatenated ranges) set type.

Make the AVX2-optimized version available from the control plane as
well, then use it during insert.  This gives a nice speedup for large
sets. All from myself.
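
The control-plane dispatch, condensed from the nft_set_pipapo.c diff
below: the AVX2 helper is only used when the CPU supports it and the
FPU may be used in the current context, otherwise the generic C
implementation runs.

	local_bh_disable();
#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
	if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
	    irq_fpu_usable()) {
		e = pipapo_get_avx2(m, data, genmask, tstamp);
		local_bh_enable();
		return e;
	}
#endif
	e = pipapo_get_slow(m, data, genmask, tstamp);
	local_bh_enable();
	return e;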

On PREEMPT_RT, we can't rely on local_bh_disable() to protect the
access to the percpu scratch maps.  Use nested-BH locking for this
instead. From Sebastian Siewior.
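
The locking scheme, condensed from the pipapo diffs below: the percpu
scratch area carries its own local_lock_t, which is effectively a
lockdep annotation on !PREEMPT_RT but a real per-CPU lock on
PREEMPT_RT.

	struct nft_pipapo_scratch {
		local_lock_t bh_lock;	/* PREEMPT_RT local spinlock */
		u8 map_index;
		unsigned long __map[];
	};

	/* lookup path, BH already disabled: */
	scratch = *raw_cpu_ptr(m->scratch);
	__local_lock_nested_bh(&scratch->bh_lock);
	/* ... use scratch->__map ... */
	__local_unlock_nested_bh(&scratch->bh_lock);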

* tag 'nf-next-25-08-20' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
  netfilter: nft_set_pipapo: Use nested-BH locking for nft_pipapo_scratch
  netfilter: nft_set_pipapo: Store real pointer, adjust later.
  netfilter: nft_set_pipapo: use avx2 algorithm for insertions too
  netfilter: nft_set_pipapo_avx2: split lookup function in two parts
  netfilter: nft_set_pipapo_avx2: Drop the comment regarding protection
  netfilter: ctnetlink: remove refcounting in dying list dumping
====================

Link: https://patch.msgid.link/20250820144738.24250-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Committed by Jakub Kicinski, 2025-08-21 17:23:26 -07:00
5 changed files with 160 additions and 129 deletions

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c

@@ -60,7 +60,7 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("List and change connection tracking table");
 
 struct ctnetlink_list_dump_ctx {
-	struct nf_conn *last;
+	unsigned long last_id;
 	unsigned int cpu;
 	bool done;
 };
@@ -1733,16 +1733,6 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb,
 	return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
 }
 
-static int ctnetlink_done_list(struct netlink_callback *cb)
-{
-	struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
-
-	if (ctx->last)
-		nf_ct_put(ctx->last);
-
-	return 0;
-}
-
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 static int ctnetlink_dump_one_entry(struct sk_buff *skb,
 				    struct netlink_callback *cb,
@@ -1757,11 +1747,11 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
 	if (l3proto && nf_ct_l3num(ct) != l3proto)
 		return 0;
 
-	if (ctx->last) {
-		if (ct != ctx->last)
+	if (ctx->last_id) {
+		if (ctnetlink_get_id(ct) != ctx->last_id)
 			return 0;
 
-		ctx->last = NULL;
+		ctx->last_id = 0;
 	}
 
 	/* We can't dump extension info for the unconfirmed
@@ -1775,12 +1765,8 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
 				  cb->nlh->nlmsg_seq,
 				  NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
 				  ct, dying, 0);
-	if (res < 0) {
-		if (!refcount_inc_not_zero(&ct->ct_general.use))
-			return 0;
-
-		ctx->last = ct;
-	}
+	if (res < 0)
+		ctx->last_id = ctnetlink_get_id(ct);
 
 	return res;
 }
@@ -1796,10 +1782,10 @@ static int
 ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
-	struct nf_conn *last = ctx->last;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	const struct net *net = sock_net(skb->sk);
 	struct nf_conntrack_net_ecache *ecache_net;
+	unsigned long last_id = ctx->last_id;
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
 #endif
@@ -1807,7 +1793,7 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
 	if (ctx->done)
 		return 0;
 
-	ctx->last = NULL;
+	ctx->last_id = 0;
 
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	ecache_net = nf_conn_pernet_ecache(net);
@@ -1818,24 +1804,21 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
 		int res;
 
 		ct = nf_ct_tuplehash_to_ctrack(h);
-		if (last && last != ct)
+		if (last_id && last_id != ctnetlink_get_id(ct))
 			continue;
 
 		res = ctnetlink_dump_one_entry(skb, cb, ct, true);
 		if (res < 0) {
 			spin_unlock_bh(&ecache_net->dying_lock);
-			nf_ct_put(last);
 			return skb->len;
 		}
 
-		nf_ct_put(last);
-		last = NULL;
+		last_id = 0;
 	}
 
 	spin_unlock_bh(&ecache_net->dying_lock);
 #endif
 	ctx->done = true;
-	nf_ct_put(last);
 
 	return skb->len;
 }
@@ -1847,7 +1830,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb,
 	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_dump_dying,
-			.done = ctnetlink_done_list,
 		};
 
 		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
@@ -1862,7 +1844,6 @@ static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
 	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_dump_unconfirmed,
-			.done = ctnetlink_done_list,
 		};
 
 		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}

diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c

@@ -397,7 +397,7 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
 }
 
 /**
- * pipapo_get() - Get matching element reference given key data
+ * pipapo_get_slow() - Get matching element reference given key data
  * @m: storage containing the set elements
  * @data: Key data to be matched against existing elements
  * @genmask: If set, check that element is active in given genmask
@@ -414,12 +414,12 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
  *
  * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
  */
-static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
-					  const u8 *data, u8 genmask,
-					  u64 tstamp)
+static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
+					       const u8 *data, u8 genmask,
+					       u64 tstamp)
 {
+	unsigned long *res_map, *fill_map, *map;
 	struct nft_pipapo_scratch *scratch;
-	unsigned long *res_map, *fill_map;
 	const struct nft_pipapo_field *f;
 	bool map_index;
 	int i;
@@ -429,11 +429,13 @@ static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
 	scratch = *raw_cpu_ptr(m->scratch);
 	if (unlikely(!scratch))
 		goto out;
 
+	__local_lock_nested_bh(&scratch->bh_lock);
 	map_index = scratch->map_index;
 
-	res_map = scratch->map + (map_index ? m->bsize_max : 0);
-	fill_map = scratch->map + (map_index ? 0 : m->bsize_max);
+	map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+	res_map = map + (map_index ? m->bsize_max : 0);
+	fill_map = map + (map_index ? 0 : m->bsize_max);
 
 	pipapo_resmap_init(m, res_map);
@@ -464,6 +466,7 @@ static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
 			      last);
 		if (b < 0) {
 			scratch->map_index = map_index;
+			__local_unlock_nested_bh(&scratch->bh_lock);
 			local_bh_enable();
 
 			return NULL;
@@ -483,6 +486,7 @@ static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
 			 * *next* bitmap (not initial) for the next packet.
 			 */
 			scratch->map_index = map_index;
+			__local_unlock_nested_bh(&scratch->bh_lock);
 			local_bh_enable();
 
 			return e;
 		}
@@ -497,11 +501,47 @@ static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
 		data += NFT_PIPAPO_GROUPS_PADDING(f);
 	}
 
+	__local_unlock_nested_bh(&scratch->bh_lock);
 out:
 	local_bh_enable();
 	return NULL;
 }
 
+/**
+ * pipapo_get() - Get matching element reference given key data
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
+ *
+ * This is a dispatcher function, either calling out the generic C
+ * implementation or, if available, the AVX2 one.
+ * This helper is only called from the control plane, with either RCU
+ * read lock or transaction mutex held.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
+ */
+static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
+					  const u8 *data, u8 genmask,
+					  u64 tstamp)
+{
+	struct nft_pipapo_elem *e;
+
+	local_bh_disable();
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+	if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
+	    irq_fpu_usable()) {
+		e = pipapo_get_avx2(m, data, genmask, tstamp);
+		local_bh_enable();
+		return e;
+	}
+#endif
+	e = pipapo_get_slow(m, data, genmask, tstamp);
+	local_bh_enable();
+	return e;
+}
+
 /**
  * nft_pipapo_lookup() - Dataplane fronted for main lookup function
  * @net: Network namespace
@@ -523,7 +563,7 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
 	const struct nft_pipapo_elem *e;
 
 	m = rcu_dereference(priv->match);
-	e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64());
+	e = pipapo_get_slow(m, (const u8 *)key, genmask, get_jiffies_64());
 	return e ? &e->ext : NULL;
 }
@@ -1136,22 +1176,17 @@ static void pipapo_map(struct nft_pipapo_match *m,
 }
 
 /**
- * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address
+ * pipapo_free_scratch() - Free per-CPU map at original address
 * @m: Matching data
 * @cpu: CPU number
 */
 static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu)
 {
 	struct nft_pipapo_scratch *s;
-	void *mem;
 
 	s = *per_cpu_ptr(m->scratch, cpu);
 	if (!s)
 		return;
 
-	mem = s;
-	mem -= s->align_off;
-	kvfree(mem);
+	kvfree(s);
 }
 
 /**
@@ -1168,11 +1203,8 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
 	for_each_possible_cpu(i) {
 		struct nft_pipapo_scratch *scratch;
-#ifdef NFT_PIPAPO_ALIGN
-		void *scratch_aligned;
-		u32 align_off;
-#endif
-		scratch = kvzalloc_node(struct_size(scratch, map, bsize_max * 2) +
+		scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) +
					NFT_PIPAPO_ALIGN_HEADROOM,
					GFP_KERNEL_ACCOUNT, cpu_to_node(i));
 		if (!scratch) {
@@ -1187,23 +1219,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
 		}
 
 		pipapo_free_scratch(clone, i);
 
-#ifdef NFT_PIPAPO_ALIGN
-		/* Align &scratch->map (not the struct itself): the extra
-		 * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node()
-		 * above guarantee we can waste up to those bytes in order
-		 * to align the map field regardless of its offset within
-		 * the struct.
-		 */
-		BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM);
-		scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map);
-		scratch_aligned -= offsetof(struct nft_pipapo_scratch, map);
-		align_off = scratch_aligned - (void *)scratch;
-		scratch = scratch_aligned;
-		scratch->align_off = align_off;
-#endif
+		local_lock_init(&scratch->bh_lock);
 		*per_cpu_ptr(clone->scratch, i) = scratch;
 	}

diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h

@@ -124,14 +124,14 @@ struct nft_pipapo_field {
 
 /**
  * struct nft_pipapo_scratch - percpu data used for lookup and matching
+ * @bh_lock: PREEMPT_RT local spinlock
  * @map_index: Current working bitmap index, toggled between field matches
- * @align_off: Offset to get the originally allocated address
- * @map: store partial matching results during lookup
+ * @__map: store partial matching results during lookup
  */
 struct nft_pipapo_scratch {
+	local_lock_t bh_lock;
 	u8 map_index;
-	u32 align_off;
-	unsigned long map[];
+	unsigned long __map[];
 };
 
 /**
/**

diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c

@@ -1133,66 +1133,50 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns
 }
 
 /**
- * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
- * @net: Network namespace
- * @set: nftables API set representation
- * @key: nftables API element representation containing key data
+ * pipapo_get_avx2() - Lookup function for AVX2 implementation
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
  *
  * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
  *
  * This implementation exploits the repetitive characteristic of the algorithm
 * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
  *
- * Return: true on match, false otherwise.
+ * The caller must check that the FPU is usable.
+ * This function must be called with BH disabled.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
  */
-const struct nft_set_ext *
-nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
-		       const u32 *key)
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+					const u8 *data, u8 genmask,
+					u64 tstamp)
 {
-	struct nft_pipapo *priv = nft_set_priv(set);
-	const struct nft_set_ext *ext = NULL;
 	struct nft_pipapo_scratch *scratch;
-	u8 genmask = nft_genmask_cur(net);
-	const struct nft_pipapo_match *m;
 	const struct nft_pipapo_field *f;
-	const u8 *rp = (const u8 *)key;
-	unsigned long *res, *fill;
+	unsigned long *res, *fill, *map;
 	bool map_index;
 	int i;
 
-	local_bh_disable();
+	scratch = *raw_cpu_ptr(m->scratch);
+	if (unlikely(!scratch))
+		return NULL;
 
-	if (unlikely(!irq_fpu_usable())) {
-		ext = nft_pipapo_lookup(net, set, key);
+	__local_lock_nested_bh(&scratch->bh_lock);
+	map_index = scratch->map_index;
+	map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+	res = map + (map_index ? m->bsize_max : 0);
+	fill = map + (map_index ? 0 : m->bsize_max);
 
-		local_bh_enable();
-		return ext;
-	}
+	pipapo_resmap_init_avx2(m, res);
 
-	m = rcu_dereference(priv->match);
-
-	/* This also protects access to all data related to scratch maps.
-	 *
-	 * Note that we don't need a valid MXCSR state for any of the
+	/* Note that we don't need a valid MXCSR state for any of the
	 * operations we use here, so pass 0 as mask and spare a LDMXCSR
	 * instruction.
	 */
 	kernel_fpu_begin_mask(0);
 
-	scratch = *raw_cpu_ptr(m->scratch);
-	if (unlikely(!scratch)) {
-		kernel_fpu_end();
-		local_bh_enable();
-		return NULL;
-	}
-
-	map_index = scratch->map_index;
-	res = scratch->map + (map_index ? m->bsize_max : 0);
-	fill = scratch->map + (map_index ? 0 : m->bsize_max);
-
-	pipapo_resmap_init_avx2(m, res);
-
 	nft_pipapo_avx2_prepare();
 
 next_match:
@@ -1202,7 +1186,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
 #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)				\
 		(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,	\
-							 ret, rp,	\
+							 ret, data,	\
							 first, last))
 
		if (likely(f->bb == 8)) {
@@ -1218,7 +1202,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
			} else {
				ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
-								  ret, rp,
+								  ret, data,
								  first, last);
			}
		} else {
@@ -1234,7 +1218,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
			} else {
				ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
-								  ret, rp,
+								  ret, data,
								  first, last);
			}
		}
@@ -1242,29 +1226,75 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
 #undef NFT_SET_PIPAPO_AVX2_LOOKUP
 
-		if (ret < 0)
-			goto out;
-
-		if (last) {
-			const struct nft_set_ext *e = &f->mt[ret].e->ext;
-
-			if (unlikely(nft_set_elem_expired(e) ||
-				     !nft_set_elem_active(e, genmask)))
-				goto next_match;
-
-			ext = e;
-			goto out;
+		if (ret < 0) {
+			scratch->map_index = map_index;
+			kernel_fpu_end();
+			__local_unlock_nested_bh(&scratch->bh_lock);
+			return NULL;
 		}
 
+		if (last) {
+			struct nft_pipapo_elem *e;
+
+			e = f->mt[ret].e;
+			if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+				     !nft_set_elem_active(&e->ext, genmask)))
+				goto next_match;
+
+			scratch->map_index = map_index;
+			kernel_fpu_end();
+			__local_unlock_nested_bh(&scratch->bh_lock);
+			return e;
+		}
+
 		map_index = !map_index;
 		swap(res, fill);
-		rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+		data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
	}
 
-out:
	if (i % 2)
		scratch->map_index = !map_index;
	kernel_fpu_end();
+	__local_unlock_nested_bh(&scratch->bh_lock);
+	return NULL;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: nftables API element representation containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy using
+ * the AVX2 routines if the FPU is usable or fall back to the generic
+ * implementation of the algorithm otherwise.
+ *
+ * Return: nftables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+		       const u32 *key)
+{
+	struct nft_pipapo *priv = nft_set_priv(set);
+	u8 genmask = nft_genmask_cur(net);
+	const struct nft_pipapo_match *m;
+	const u8 *rp = (const u8 *)key;
+	const struct nft_pipapo_elem *e;
+
+	local_bh_disable();
+
+	if (unlikely(!irq_fpu_usable())) {
+		const struct nft_set_ext *ext;
+
+		ext = nft_pipapo_lookup(net, set, key);
+
+		local_bh_enable();
+		return ext;
+	}
+
+	m = rcu_dereference(priv->match);
+	e = pipapo_get_avx2(m, rp, genmask, get_jiffies_64());
 	local_bh_enable();
 
-	return ext;
+	return e ? &e->ext : NULL;
 }

diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h

@@ -5,8 +5,12 @@
 #include <asm/fpu/xstate.h>
 #define NFT_PIPAPO_ALIGN	(XSAVE_YMM_SIZE / BITS_PER_BYTE)
 
+struct nft_pipapo_match;
 bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
			      struct nft_set_estimate *est);
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+					const u8 *data, u8 genmask,
+					u64 tstamp);
 #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */
 
 #endif /* _NFT_SET_PIPAPO_AVX2_H */