Merge branch 'inet-frags-fully-use-rcu'

Eric Dumazet says:

====================
inet: frags: fully use RCU

While inet reassembly uses RCU, it acquires/releases
a refcount on struct inet_frag_queue in the fast path,
for no good reason.

This was mentioned in one patch changelog seven years ago :/

This series removes these refcount changes by extending
the RCU sections.
====================

Link: https://patch.msgid.link/20250312082250.1803501-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Committed by Paolo Abeni on 2025-03-18 13:18:37 +01:00
7 changed files with 89 additions and 84 deletions
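As an editorial sketch of the conversion (condensed from the ip_defrag() hunk below; the helper names are real, the surrounding fragments are illustrative rather than verbatim): the old fast path took and dropped a reference on every packet, while the new one stays inside a single RCU read-side section, takes no reference during the lookup, and batches whatever references the queueing code gives up into a local counter that is released once.

	/* Before: per-packet refcount traffic. */
	qp = ip_find(net, ip_hdr(skb), user, vif);	/* grabbed a reference */
	if (qp) {
		spin_lock(&qp->q.lock);
		ret = ip_frag_queue(qp, skb);
		spin_unlock(&qp->q.lock);
		ipq_put(qp);				/* refcount_dec_and_test() */
		return ret;
	}

	/* After: one RCU section, references dropped in a single batch. */
	rcu_read_lock();
	qp = ip_find(net, ip_hdr(skb), user, vif);	/* no reference taken */
	if (qp) {
		int ret, refs = 0;

		spin_lock(&qp->q.lock);
		ret = ip_frag_queue(qp, skb, &refs);	/* may add to refs */
		spin_unlock(&qp->q.lock);
		rcu_read_unlock();
		inet_frag_putn(&qp->q, refs);		/* one refcount_sub_and_test() */
		return ret;
	}
	rcu_read_unlock();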

include/net/inet_frag.h

@@ -137,7 +137,7 @@ static inline void fqdir_pre_exit(struct fqdir *fqdir)
}
void fqdir_exit(struct fqdir *fqdir);
- void inet_frag_kill(struct inet_frag_queue *q);
+ void inet_frag_kill(struct inet_frag_queue *q, int *refs);
void inet_frag_destroy(struct inet_frag_queue *q);
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
@@ -145,9 +145,9 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
unsigned int inet_frag_rbtree_purge(struct rb_root *root,
enum skb_drop_reason reason);
- static inline void inet_frag_put(struct inet_frag_queue *q)
+ static inline void inet_frag_putn(struct inet_frag_queue *q, int refs)
{
- if (refcount_dec_and_test(&q->refcnt))
+ if (refs && refcount_sub_and_test(refs, &q->refcnt))
inet_frag_destroy(q);
}
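Editorial note on the contract defined above: refs counts the references the caller has become responsible for releasing (inet_frag_kill(), later in this commit, increments it for the timer and hash-table references it gives up), and inet_frag_putn() releases them all with a single atomic operation, skipping the atomic entirely when refs stayed zero. A minimal, hypothetical usage sketch:

	int refs = 0;			/* caller owns no references yet */

	spin_lock(&q->lock);
	inet_frag_kill(q, &refs);	/* timer and hash refs accumulate here */
	spin_unlock(&q->lock);

	inet_frag_putn(q, refs);	/* single refcount_sub_and_test(refs, ...) */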

include/net/ipv6_frag.h

@@ -66,6 +66,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
{
struct net_device *dev = NULL;
struct sk_buff *head;
+ int refs = 1;
rcu_read_lock();
/* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
@@ -77,7 +78,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
goto out;
fq->q.flags |= INET_FRAG_DROP;
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, &refs);
dev = dev_get_by_index_rcu(net, fq->iif);
if (!dev)
@@ -109,7 +110,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
spin_unlock(&fq->q.lock);
out_rcu_unlock:
rcu_read_unlock();
- inet_frag_put(&fq->q);
+ inet_frag_putn(&fq->q, refs);
}
/* Check if the upper layer header is truncated in the first fragment. */

net/ieee802154/6lowpan/reassembly.c

@@ -31,7 +31,8 @@ static const char lowpan_frags_cache_name[] = "lowpan-frags";
static struct inet_frags lowpan_frags;
static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb,
- struct sk_buff *prev, struct net_device *ldev);
+ struct sk_buff *prev, struct net_device *ldev,
+ int *refs);
static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
{
@@ -45,6 +46,7 @@ static void lowpan_frag_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct frag_queue *fq;
+ int refs = 1;
fq = container_of(frag, struct frag_queue, q);
@@ -53,10 +55,10 @@ static void lowpan_frag_expire(struct timer_list *t)
if (fq->q.flags & INET_FRAG_COMPLETE)
goto out;
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, &refs);
out:
spin_unlock(&fq->q.lock);
- inet_frag_put(&fq->q);
+ inet_frag_putn(&fq->q, refs);
}
static inline struct lowpan_frag_queue *
@@ -82,7 +84,8 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
}
static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
- struct sk_buff *skb, u8 frag_type)
+ struct sk_buff *skb, u8 frag_type,
+ int *refs)
{
struct sk_buff *prev_tail;
struct net_device *ldev;
@@ -143,7 +146,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- res = lowpan_frag_reasm(fq, skb, prev_tail, ldev);
+ res = lowpan_frag_reasm(fq, skb, prev_tail, ldev, refs);
skb->_skb_refdst = orefdst;
return res;
}
@@ -162,11 +165,12 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
* the last and the first frames arrived and all the bits are here.
*/
static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *ldev)
+ struct sk_buff *prev_tail, struct net_device *ldev,
+ int *refs)
{
void *reasm_data;
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
if (!reasm_data)
@@ -300,17 +304,20 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
goto err;
}
+ rcu_read_lock();
fq = fq_find(net, cb, &hdr.source, &hdr.dest);
if (fq != NULL) {
- int ret;
+ int ret, refs = 0;
spin_lock(&fq->q.lock);
- ret = lowpan_frag_queue(fq, skb, frag_type);
+ ret = lowpan_frag_queue(fq, skb, frag_type, &refs);
spin_unlock(&fq->q.lock);
- inet_frag_put(&fq->q);
+ rcu_read_unlock();
+ inet_frag_putn(&fq->q, refs);
return ret;
}
+ rcu_read_unlock();
err:
kfree_skb(skb);

net/ipv4/inet_fragment.c

@@ -145,8 +145,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
}
spin_unlock_bh(&fq->lock);
- if (refcount_sub_and_test(count, &fq->refcnt))
- inet_frag_destroy(fq);
+ inet_frag_putn(fq, count);
}
static LLIST_HEAD(fqdir_free_list);
@@ -226,10 +225,10 @@ void fqdir_exit(struct fqdir *fqdir)
}
EXPORT_SYMBOL(fqdir_exit);
- void inet_frag_kill(struct inet_frag_queue *fq)
+ void inet_frag_kill(struct inet_frag_queue *fq, int *refs)
{
if (del_timer(&fq->timer))
- refcount_dec(&fq->refcnt);
+ (*refs)++;
if (!(fq->flags & INET_FRAG_COMPLETE)) {
struct fqdir *fqdir = fq->fqdir;
@@ -244,7 +243,7 @@ void inet_frag_kill(struct inet_frag_queue *fq)
if (!READ_ONCE(fqdir->dead)) {
rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
fqdir->f->rhash_params);
- refcount_dec(&fq->refcnt);
+ (*refs)++;
} else {
fq->flags |= INET_FRAG_HASH_DEAD;
}
@@ -328,7 +327,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
timer_setup(&q->timer, f->frag_expire, 0);
spin_lock_init(&q->lock);
- refcount_set(&q->refcnt, 3);
+ /* One reference for the timer, one for the hash table. */
+ refcount_set(&q->refcnt, 2);
return q;
}
@@ -350,15 +350,20 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
&q->node, f->rhash_params);
if (*prev) {
+ /* We could not insert in the hash table,
+ * we need to cancel what inet_frag_alloc()
+ * anticipated.
+ */
+ int refs = 1;
q->flags |= INET_FRAG_COMPLETE;
- inet_frag_kill(q);
- inet_frag_destroy(q);
+ inet_frag_kill(q, &refs);
+ inet_frag_putn(q, refs);
return NULL;
}
return q;
}
- /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
/* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
@@ -368,17 +373,11 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
return NULL;
- rcu_read_lock();
prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
if (!prev)
fq = inet_frag_create(fqdir, key, &prev);
- if (!IS_ERR_OR_NULL(prev)) {
+ if (!IS_ERR_OR_NULL(prev))
fq = prev;
- if (!refcount_inc_not_zero(&fq->refcnt))
- fq = NULL;
- }
- rcu_read_unlock();
return fq;
}
EXPORT_SYMBOL(inet_frag_find);

net/ipv4/ip_fragment.c

@@ -76,7 +76,8 @@ static u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags;
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev);
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -107,22 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q)
inet_putpeer(qp->peer);
}
- /* Destruction primitives. */
- static void ipq_put(struct ipq *ipq)
- {
- inet_frag_put(&ipq->q);
- }
- /* Kill ipq entry. It is not destroyed immediately,
- * because caller (and someone more) holds reference count.
- */
- static void ipq_kill(struct ipq *ipq)
- {
- inet_frag_kill(&ipq->q);
- }
static bool frag_expire_skip_icmp(u32 user)
{
return user == IP_DEFRAG_AF_PACKET ||
@@ -143,6 +128,7 @@ static void ip_expire(struct timer_list *t)
struct sk_buff *head = NULL;
struct net *net;
struct ipq *qp;
+ int refs = 1;
qp = container_of(frag, struct ipq, q);
net = qp->q.fqdir->net;
@@ -159,7 +145,7 @@ static void ip_expire(struct timer_list *t)
goto out;
qp->q.flags |= INET_FRAG_DROP;
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, &refs);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
@@ -202,7 +188,7 @@ static void ip_expire(struct timer_list *t)
out_rcu_unlock:
rcu_read_unlock();
kfree_skb_reason(head, reason);
- ipq_put(qp);
+ inet_frag_putn(&qp->q, refs);
}
/* Find the correct entry in the "incomplete datagrams" queue for
@@ -278,7 +264,7 @@ static int ip_frag_reinit(struct ipq *qp)
}
/* Add new segment to existing queue. */
- static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs)
{
struct net *net = qp->q.fqdir->net;
int ihl, end, flags, offset;
@@ -298,7 +284,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
unlikely(ip_frag_too_far(qp)) &&
unlikely(err = ip_frag_reinit(qp))) {
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, refs);
goto err;
}
@@ -382,10 +368,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, skb, prev_tail, dev);
+ err = ip_frag_reasm(qp, skb, prev_tail, dev, refs);
skb->_skb_refdst = orefdst;
if (err)
- inet_frag_kill(&qp->q);
+ inet_frag_kill(&qp->q, refs);
return err;
}
@@ -402,7 +388,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
err = -EINVAL;
__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
discard_qp:
- inet_frag_kill(&qp->q);
+ inet_frag_kill(&qp->q, refs);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
err:
kfree_skb_reason(skb, reason);
@@ -416,7 +402,8 @@ static bool ip_frag_coalesce_ok(const struct ipq *qp)
/* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
struct net *net = qp->q.fqdir->net;
struct iphdr *iph;
@@ -424,7 +411,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
int len, err;
u8 ecn;
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, refs);
ecn = ip_frag_ecn_table[qp->ecn];
if (unlikely(ecn == 0xff)) {
@@ -496,18 +483,21 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
/* Lookup (or create) queue header */
+ rcu_read_lock();
qp = ip_find(net, ip_hdr(skb), user, vif);
if (qp) {
- int ret;
+ int ret, refs = 0;
spin_lock(&qp->q.lock);
- ret = ip_frag_queue(qp, skb);
+ ret = ip_frag_queue(qp, skb, &refs);
spin_unlock(&qp->q.lock);
- ipq_put(qp);
+ rcu_read_unlock();
+ inet_frag_putn(&qp->q, refs);
return ret;
}
+ rcu_read_unlock();
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);

net/ipv6/netfilter/nf_conntrack_reasm.c

@@ -123,7 +123,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
#endif
static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev);
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
{
@@ -167,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
- const struct frag_hdr *fhdr, int nhoff)
+ const struct frag_hdr *fhdr, int nhoff,
+ int *refs)
{
unsigned int payload_len;
struct net_device *dev;
@@ -221,7 +223,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
* this case. -DaveM
*/
pr_debug("end of fragment not rounded to 8 bytes.\n");
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
return -EPROTO;
}
if (end > fq->q.len) {
@@ -287,7 +289,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = nf_ct_frag6_reasm(fq, skb, prev, dev);
+ err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs);
skb->_skb_refdst = orefdst;
/* After queue has assumed skb ownership, only 0 or
@@ -301,7 +303,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
return -EINPROGRESS;
insert_error:
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
err:
skb_dst_drop(skb);
return -EINVAL;
@@ -315,13 +317,14 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
* the last and the first frames arrived and all the bits are here.
*/
static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
void *reasm_data;
int payload_len;
u8 ecn;
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
ecn = ip_frag_ecn_table[fq->ecn];
if (unlikely(ecn == 0xff))
@@ -372,7 +375,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
return 0;
err:
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
return -EINVAL;
}
@@ -447,6 +450,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
struct frag_hdr *fhdr;
struct frag_queue *fq;
struct ipv6hdr *hdr;
+ int refs = 0;
u8 prevhdr;
/* Jumbo payload inhibits frag. header */
@@ -473,23 +477,26 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);
+ rcu_read_lock();
fq = fq_find(net, fhdr->identification, user, hdr,
skb->dev ? skb->dev->ifindex : 0);
if (fq == NULL) {
+ rcu_read_unlock();
pr_debug("Can't find and can't create new queue\n");
return -ENOMEM;
}
spin_lock_bh(&fq->q.lock);
- ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff);
+ ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs);
if (ret == -EPROTO) {
skb->transport_header = savethdr;
ret = 0;
}
spin_unlock_bh(&fq->q.lock);
- inet_frag_put(&fq->q);
+ rcu_read_unlock();
+ inet_frag_putn(&fq->q, refs);
return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);

net/ipv6/reassembly.c

@@ -68,7 +68,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
static struct inet_frags ip6_frags;
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev);
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
static void ip6_frag_expire(struct timer_list *t)
{
@@ -105,7 +106,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
struct frag_hdr *fhdr, int nhoff,
- u32 *prob_offset)
+ u32 *prob_offset, int *refs)
{
struct net *net = dev_net(skb_dst(skb)->dev);
int offset, end, fragsize;
@@ -220,7 +221,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip6_frag_reasm(fq, skb, prev_tail, dev);
+ err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs);
skb->_skb_refdst = orefdst;
return err;
}
@@ -238,7 +239,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_REASM_OVERLAPS);
discard_fq:
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_REASMFAILS);
err:
@@ -254,7 +255,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
* the last and the first frames arrived and all the bits are here.
*/
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
struct net *net = fq->q.fqdir->net;
unsigned int nhoff;
@@ -262,7 +264,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
int payload_len;
u8 ecn;
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
ecn = ip_frag_ecn_table[fq->ecn];
if (unlikely(ecn == 0xff))
@@ -303,9 +305,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
skb_postpush_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
- rcu_read_lock();
__IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS);
- rcu_read_unlock();
fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
fq->q.last_run_head = NULL;
@@ -317,10 +317,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
out_oom:
net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n");
out_fail:
- rcu_read_lock();
__IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS);
- rcu_read_unlock();
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
return -1;
}
@@ -377,19 +375,21 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
}
iif = skb->dev ? skb->dev->ifindex : 0;
+ rcu_read_lock();
fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) {
u32 prob_offset = 0;
- int ret;
+ int ret, refs = 0;
spin_lock(&fq->q.lock);
fq->iif = iif;
ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff,
- &prob_offset);
+ &prob_offset, &refs);
spin_unlock(&fq->q.lock);
- inet_frag_put(&fq->q);
+ rcu_read_unlock();
+ inet_frag_putn(&fq->q, refs);
if (prob_offset) {
__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
IPSTATS_MIB_INHDRERRORS);
@@ -398,6 +398,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
}
return ret;
}
+ rcu_read_unlock();
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);