Merge branch 'improve-neigh_flush_dev-performance'

Gilad Naaman says:

====================
Improve neigh_flush_dev performance

This patchsets improves the performance of neigh_flush_dev.

Currently, the only way to implement it requires traversing
all neighbours known to the kernel, across all network-namespaces.

This means that some flows are slowed down as a function of neigh-scale,
even if the specific link they're handling has little to no neighbours.

In order to solve this, this patchset adds a netdev->neighbours list,
as well as making the original linked-list doubly-, so that it is
possible to unlink neighbours without traversing the hash-bucket to
obtain the previous neighbour.

The original use-case we encountered was mass-deletion of links (12K
VLANs) while there are 50K ARPs and 50K NDPs in the system; though the
slowdowns would also appear when the links are set down.
====================

Link: https://patch.msgid.link/20241107160444.2913124-1-gnaaman@drivenets.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski
2024-11-09 13:22:58 -08:00
6 changed files with 171 additions and 204 deletions

View File

@@ -188,4 +188,5 @@ u64 max_pacing_offload_horizon
struct_napi_config* napi_config
unsigned_long gro_flush_timeout
u32 napi_defer_hard_irqs
struct hlist_head neighbours[2]
=================================== =========================== =================== =================== ===================================================================================

View File

@@ -52,6 +52,7 @@
#include <net/net_trackers.h>
#include <net/net_debug.h>
#include <net/dropreason-core.h>
#include <net/neighbour_tables.h>
struct netpoll_info;
struct device;
@@ -2032,6 +2033,9 @@ enum netdev_reg_state {
* @napi_defer_hard_irqs: If not zero, provides a counter that would
* allow to avoid NIC hard IRQ, on busy queues.
*
* @neighbours: List heads pointing to this device's neighbours'
* dev_list, one per address-family.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
@@ -2440,6 +2444,9 @@ struct net_device {
*/
struct net_shaper_hierarchy *net_shaper_hierarchy;
#endif
struct hlist_head neighbours[NEIGH_NR_TABLES];
u8 priv[] ____cacheline_aligned
__counted_by(priv_len);
} ____cacheline_aligned;

View File

@@ -29,6 +29,7 @@
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/rtnetlink.h>
#include <net/neighbour_tables.h>
/*
* NUD stands for "neighbor unreachability detection"
@@ -135,7 +136,8 @@ struct neigh_statistics {
#define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field)
struct neighbour {
struct neighbour __rcu *next;
struct hlist_node hash;
struct hlist_node dev_list;
struct neigh_table *tbl;
struct neigh_parms *parms;
unsigned long confirmed;
@@ -190,7 +192,7 @@ struct pneigh_entry {
#define NEIGH_NUM_HASH_RND 4
struct neigh_hash_table {
struct neighbour __rcu **hash_buckets;
struct hlist_head *hash_heads;
unsigned int hash_shift;
__u32 hash_rnd[NEIGH_NUM_HASH_RND];
struct rcu_head rcu;
@@ -236,13 +238,6 @@ struct neigh_table {
struct pneigh_entry **phash_buckets;
};
enum {
NEIGH_ARP_TABLE = 0,
NEIGH_ND_TABLE = 1,
NEIGH_NR_TABLES,
NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */
};
static inline int neigh_parms_family(struct neigh_parms *p)
{
return p->tbl->family;
@@ -275,6 +270,12 @@ static inline void *neighbour_priv(const struct neighbour *n)
extern const struct nla_policy nda_policy[];
#define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash)
#define neigh_for_each_in_bucket_rcu(pos, head) \
hlist_for_each_entry_rcu(pos, head, hash)
#define neigh_for_each_in_bucket_safe(pos, tmp, head) \
hlist_for_each_entry_safe(pos, tmp, head, hash)
static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey)
{
return *(const u32 *)n->primary_key == *(const u32 *)pkey;
@@ -303,12 +304,9 @@ static inline struct neighbour *___neigh_lookup_noref(
u32 hash_val;
hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
for (n = rcu_dereference(nht->hash_buckets[hash_val]);
n != NULL;
n = rcu_dereference(n->next)) {
neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val])
if (n->dev == dev && key_eq(n, pkey))
return n;
}
return NULL;
}
@@ -349,7 +347,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags,
u32 nlmsg_pid);
void __neigh_set_probe_once(struct neighbour *neigh);
bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl);
bool neigh_remove_one(struct neighbour *ndel);
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev);

View File

@@ -0,0 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_NEIGHBOUR_TABLES_H
#define _NET_NEIGHBOUR_TABLES_H
enum {
NEIGH_ARP_TABLE = 0,
NEIGH_ND_TABLE = 1,
NEIGH_NR_TABLES,
NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */
};
#endif

View File

@@ -60,6 +60,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
static const struct seq_operations neigh_stat_seq_ops;
#endif
static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family)
{
int i;
switch (family) {
default:
DEBUG_NET_WARN_ON_ONCE(1);
fallthrough; /* to avoid panic by null-ptr-deref */
case AF_INET:
i = NEIGH_ARP_TABLE;
break;
case AF_INET6:
i = NEIGH_ND_TABLE;
break;
}
return &dev->neighbours[i];
}
/*
Neighbour hash table buckets are protected with rwlock tbl->lock.
@@ -204,18 +223,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
}
}
static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
struct neigh_table *tbl)
bool neigh_remove_one(struct neighbour *n)
{
bool retval = false;
write_lock(&n->lock);
if (refcount_read(&n->refcnt) == 1) {
struct neighbour *neigh;
neigh = rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock));
rcu_assign_pointer(*np, neigh);
hlist_del_rcu(&n->hash);
hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
retval = true;
}
@@ -225,29 +240,6 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
return retval;
}
bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
{
struct neigh_hash_table *nht;
void *pkey = ndel->primary_key;
u32 hash_val;
struct neighbour *n;
struct neighbour __rcu **np;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
hash_val = hash_val >> (32 - nht->hash_shift);
np = &nht->hash_buckets[hash_val];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock)))) {
if (n == ndel)
return neigh_del(n, np, tbl);
np = &n->next;
}
return false;
}
static int neigh_forced_gc(struct neigh_table *tbl)
{
int max_clean = atomic_read(&tbl->gc_entries) -
@@ -275,7 +267,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
remove = true;
write_unlock(&n->lock);
if (remove && neigh_remove_one(n, tbl))
if (remove && neigh_remove_one(n))
shrunk++;
if (shrunk >= max_clean)
break;
@@ -379,54 +371,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
bool skip_perm)
{
int i;
struct neigh_hash_table *nht;
struct hlist_head *dev_head;
struct hlist_node *tmp;
struct neighbour *n;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
dev_head = neigh_get_dev_table(dev, tbl->family);
for (i = 0; i < (1 << nht->hash_shift); i++) {
struct neighbour *n;
struct neighbour __rcu **np = &nht->hash_buckets[i];
hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) {
if (skip_perm && n->nud_state & NUD_PERMANENT)
continue;
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
if (dev && n->dev != dev) {
np = &n->next;
continue;
}
if (skip_perm && n->nud_state & NUD_PERMANENT) {
np = &n->next;
continue;
}
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
write_lock(&n->lock);
neigh_del_timer(n);
neigh_mark_dead(n);
if (refcount_read(&n->refcnt) != 1) {
/* The most unpleasant situation.
We must destroy neighbour entry,
but someone still uses it.
The destroy will be delayed until
the last user releases us, but
we must kill timers etc. and move
it to safe state.
*/
__skb_queue_purge(&n->arp_queue);
n->arp_queue_len_bytes = 0;
WRITE_ONCE(n->output, neigh_blackhole);
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
else
n->nud_state = NUD_NONE;
neigh_dbg(2, "neigh %p is stray\n", n);
}
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
hlist_del_rcu(&n->hash);
hlist_del_rcu(&n->dev_list);
write_lock(&n->lock);
neigh_del_timer(n);
neigh_mark_dead(n);
if (refcount_read(&n->refcnt) != 1) {
/* The most unpleasant situation.
* We must destroy neighbour entry,
* but someone still uses it.
*
* The destroy will be delayed until
* the last user releases us, but
* we must kill timers etc. and move
* it to safe state.
*/
__skb_queue_purge(&n->arp_queue);
n->arp_queue_len_bytes = 0;
WRITE_ONCE(n->output, neigh_blackhole);
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
else
n->nud_state = NUD_NONE;
neigh_dbg(2, "neigh %p is stray\n", n);
}
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
}
}
@@ -529,20 +509,21 @@ static void neigh_get_hash_rnd(u32 *x)
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
size_t size = (1 << shift) * sizeof(struct neighbour *);
size_t size = (1 << shift) * sizeof(struct hlist_head);
struct hlist_head *hash_heads;
struct neigh_hash_table *ret;
struct neighbour __rcu **buckets;
int i;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
return NULL;
buckets = kvzalloc(size, GFP_ATOMIC);
if (!buckets) {
hash_heads = kvzalloc(size, GFP_ATOMIC);
if (!hash_heads) {
kfree(ret);
return NULL;
}
ret->hash_buckets = buckets;
ret->hash_heads = hash_heads;
ret->hash_shift = shift;
for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
neigh_get_hash_rnd(&ret->hash_rnd[i]);
@@ -555,7 +536,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table,
rcu);
kvfree(nht->hash_buckets);
kvfree(nht->hash_heads);
kfree(nht);
}
@@ -574,24 +555,17 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
return old_nht;
for (i = 0; i < (1 << old_nht->hash_shift); i++) {
struct neighbour *n, *next;
struct hlist_node *tmp;
struct neighbour *n;
for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
lockdep_is_held(&tbl->lock));
n != NULL;
n = next) {
neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) {
hash = tbl->hash(n->primary_key, n->dev,
new_nht->hash_rnd);
hash >>= (32 - new_nht->hash_shift);
next = rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock));
rcu_assign_pointer(n->next,
rcu_dereference_protected(
new_nht->hash_buckets[hash],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(new_nht->hash_buckets[hash], n);
hlist_del_rcu(&n->hash);
hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]);
}
}
@@ -678,11 +652,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_tbl_unlock;
}
for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock));
n1 != NULL;
n1 = rcu_dereference_protected(n1->next,
lockdep_is_held(&tbl->lock))) {
neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) {
if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
@@ -698,10 +668,11 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
list_add_tail(&n->managed_list, &n->tbl->managed_list);
if (want_ref)
neigh_hold(n);
rcu_assign_pointer(n->next,
rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(nht->hash_buckets[hash_val], n);
hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
hlist_add_head_rcu(&n->dev_list,
neigh_get_dev_table(dev, tbl->family));
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
@@ -933,10 +904,10 @@ static void neigh_connect(struct neighbour *neigh)
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
struct neighbour *n;
struct neighbour __rcu **np;
unsigned int i;
struct neigh_hash_table *nht;
struct hlist_node *tmp;
struct neighbour *n;
unsigned int i;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
@@ -961,10 +932,7 @@ static void neigh_periodic_work(struct work_struct *work)
goto out;
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
np = &nht->hash_buckets[i];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
unsigned int state;
write_lock(&n->lock);
@@ -973,7 +941,7 @@ static void neigh_periodic_work(struct work_struct *work)
if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
(n->flags & NTF_EXT_LEARNED)) {
write_unlock(&n->lock);
goto next_elt;
continue;
}
if (time_before(n->used, n->confirmed) &&
@@ -984,18 +952,14 @@ static void neigh_periodic_work(struct work_struct *work)
(state == NUD_FAILED ||
!time_in_range_open(jiffies, n->used,
n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
hlist_del_rcu(&n->hash);
hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
next_elt:
np = &n->next;
}
/*
* It's fine to release lock here, even if hash table
@@ -1942,7 +1906,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
NETLINK_CB(skb).portid, extack);
write_lock_bh(&tbl->lock);
neigh_release(neigh);
neigh_remove_one(neigh, tbl);
neigh_remove_one(neigh);
write_unlock_bh(&tbl->lock);
out:
@@ -2713,9 +2677,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0;
n != NULL;
n = rcu_dereference(n->next)) {
idx = 0;
neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) {
if (idx < s_idx || !net_eq(dev_net(n->dev), net))
goto next;
if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
@@ -3082,9 +3045,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
for (n = rcu_dereference(nht->hash_buckets[chain]);
n != NULL;
n = rcu_dereference(n->next))
neigh_for_each_in_bucket(n, &nht->hash_heads[chain])
cb(n, cookie);
}
read_unlock_bh(&tbl->lock);
@@ -3096,29 +3057,25 @@ EXPORT_SYMBOL(neigh_for_each);
void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
int chain;
struct neigh_hash_table *nht;
int chain;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct hlist_node *tmp;
struct neighbour *n;
struct neighbour __rcu **np;
np = &nht->hash_buckets[chain];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
hlist_del_rcu(&n->hash);
hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
} else
np = &n->next;
}
write_unlock(&n->lock);
if (release)
neigh_cleanup_and_release(n);
@@ -3175,43 +3132,53 @@ EXPORT_SYMBOL(neigh_xmit);
#ifdef CONFIG_PROC_FS
static struct neighbour *neigh_get_first(struct seq_file *seq)
static struct neighbour *neigh_get_valid(struct seq_file *seq,
struct neighbour *n,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
if (!net_eq(dev_net(n->dev), net))
return NULL;
if (state->neigh_sub_iter) {
loff_t fakep = 0;
void *v;
v = state->neigh_sub_iter(state, n, pos ? pos : &fakep);
if (!v)
return NULL;
if (pos)
return v;
}
if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
return n;
if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
return n;
return NULL;
}
static struct neighbour *neigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
struct neigh_hash_table *nht = state->nht;
struct neighbour *n = NULL;
int bucket;
struct neighbour *n, *tmp;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
n = rcu_dereference(nht->hash_buckets[bucket]);
while (n) {
if (!net_eq(dev_net(n->dev), net))
goto next;
if (state->neigh_sub_iter) {
loff_t fakep = 0;
void *v;
v = state->neigh_sub_iter(state, n, &fakep);
if (!v)
goto next;
}
if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
break;
if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
break;
next:
n = rcu_dereference(n->next);
while (++state->bucket < (1 << nht->hash_shift)) {
neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) {
tmp = neigh_get_valid(seq, n, NULL);
if (tmp)
return tmp;
}
if (n)
break;
}
state->bucket = bucket;
return n;
return NULL;
}
static struct neighbour *neigh_get_next(struct seq_file *seq,
@@ -3219,46 +3186,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct neigh_hash_table *nht = state->nht;
struct neighbour *tmp;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
if (v)
return n;
}
n = rcu_dereference(n->next);
while (1) {
while (n) {
if (!net_eq(dev_net(n->dev), net))
goto next;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
if (v)
return n;
goto next;
}
if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
break;
if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
break;
next:
n = rcu_dereference(n->next);
hlist_for_each_entry_continue(n, hash) {
tmp = neigh_get_valid(seq, n, pos);
if (tmp) {
n = tmp;
goto out;
}
if (n)
break;
if (++state->bucket >= (1 << nht->hash_shift))
break;
n = rcu_dereference(nht->hash_buckets[state->bucket]);
}
n = neigh_get_first(seq);
out:
if (n && pos)
--(*pos);
return n;
}
@@ -3361,7 +3310,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
struct neigh_seq_state *state = seq->private;
state->tbl = tbl;
state->bucket = 0;
state->bucket = -1;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
rcu_read_lock();

View File

@@ -1215,7 +1215,7 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
NEIGH_UPDATE_F_ADMIN, 0);
write_lock_bh(&tbl->lock);
neigh_release(neigh);
neigh_remove_one(neigh, tbl);
neigh_remove_one(neigh);
write_unlock_bh(&tbl->lock);
}