From 804b09be09f8af4eda5346a72361459ba21fcf1b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:29 +0300 Subject: [PATCH 01/15] vxlan: Add RCU read-side critical sections in the Tx path The Tx path does not run from an RCU read-side critical section which makes the current lockless accesses to FDB entries invalid. As far as I am aware, this has not been a problem in practice, but traces will be generated once we transition the FDB lookup to rhashtable_lookup(). Add rcu_read_{lock,unlock}() around the handling of FDB entries in the Tx path. Remove the RCU read-side critical section from vxlan_xmit_nh() as now the function is always called from an RCU read-side critical section. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-2-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 56aee539c235..7872b85e890e 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1916,12 +1916,15 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; } + rcu_read_lock(); f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); + rcu_read_unlock(); goto out; } + rcu_read_unlock(); reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, n->ha, sha); @@ -2648,14 +2651,10 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); - rcu_read_lock(); nh = rcu_dereference(f->nh); - if (!nh) { - rcu_read_unlock(); + if (!nh) goto drop; - } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); - rcu_read_unlock(); if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); @@ -2782,6 +2781,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } eth = eth_hdr(skb); + rcu_read_lock(); f = vxlan_find_mac(vxlan, eth->h_dest, vni); did_rsc = false; @@ -2804,7 +2804,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); - return NETDEV_TX_OK; + goto out; } } @@ -2829,6 +2829,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); } +out: + rcu_read_unlock(); return NETDEV_TX_OK; } From 884dd448f1ac8b5c5c0dcef9bcaab7c16ee48276 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:30 +0300 Subject: [PATCH 02/15] vxlan: Simplify creation of default FDB entry There is asymmetry in how the default FDB entry (all-zeroes) is created and destroyed in the VXLAN driver. It is created as part of the driver's newlink() routine, but destroyed as part of its ndo_uninit() routine. This caused multiple problems in the past. First, commit 0241b836732f ("vxlan: fix default fdb entry netlink notify ordering during netdev create") split the notification about the entry from its creation so that it will not be notified to user space before the VXLAN device is registered. Then, commit 6db924687139 ("vxlan: Fix error path in __vxlan_dev_create()") made the error path in __vxlan_dev_create() asymmetric by destroying the FDB entry before unregistering the net device. Otherwise, the FDB entry would have been freed twice: By ndo_uninit() as part of unregister_netdevice() and by vxlan_fdb_destroy() in the error path. Finally, commit 7c31e54aeee5 ("vxlan: do not destroy fdb if register_netdevice() is failed") split the insertion of the FDB entry into the hash table from its creation, moving the insertion after the registration of the net device. Otherwise, like before, the FDB entry would have been freed twice: By ndo_uninit() as part of register_netdevice()'s error path and by vxlan_fdb_destroy() in the error path of __vxlan_dev_create(). The end result is that the code is unnecessarily complex. In addition, the fixed size hash table cannot be converted to rhashtable as vxlan_fdb_insert() cannot fail, which will no longer be true with rhashtable. Solve this by making the addition and deletion of the default FDB entry completely symmetric. Namely, as part of newlink() routine, create the entry, insert it into to the hash table and send a notification to user space after the net device was registered. Note that at this stage the net device is still administratively down and cannot transmit / receive packets. Move the deletion from ndo_uninit() to the dellink routine(): Flush the default entry together with all the other entries, before unregistering the net device. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-3-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 86 ++++++++++++---------------------- 1 file changed, 29 insertions(+), 57 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 7872b85e890e..3df86927b1ec 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2930,18 +2930,6 @@ static int vxlan_init(struct net_device *dev) return err; } -static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) -{ - struct vxlan_fdb *f; - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); - f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); - if (f) - vxlan_fdb_destroy(vxlan, f, true, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); -} - static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -2952,8 +2940,6 @@ static void vxlan_uninit(struct net_device *dev) vxlan_vnigroup_uninit(vxlan); gro_cells_destroy(&vxlan->gro_cells); - - vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); } /* Start ageing timer and join group when device is brought up */ @@ -3187,7 +3173,7 @@ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { - /* Default entry is deleted at vxlan_uninit. */ + /* Default entry is deleted at vxlan_dellink. */ .ignore_default_entry = true, .state = 0, .state_mask = NUD_PERMANENT | NUD_NOARP, @@ -3963,7 +3949,6 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *remote_dev = NULL; struct vxlan_fdb *f = NULL; - bool unregister = false; struct vxlan_rdst *dst; int err; @@ -3974,6 +3959,26 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dev->ethtool_ops = &vxlan_ethtool_ops; + err = register_netdevice(dev); + if (err) + return err; + + if (dst->remote_ifindex) { + remote_dev = __dev_get_by_index(net, dst->remote_ifindex); + if (!remote_dev) { + err = -ENODEV; + goto unregister; + } + + err = netdev_upper_dev_link(remote_dev, dev, extack); + if (err) + goto unregister; + } + + err = rtnl_configure_link(dev, NULL, 0, NULL); + if (err < 0) + goto unlink; + /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { err = vxlan_fdb_create(vxlan, all_zeros_mac, @@ -3985,61 +3990,31 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dst->remote_ifindex, NTF_SELF, 0, &f, extack); if (err) - return err; + goto unlink; } - err = register_netdevice(dev); - if (err) - goto errout; - unregister = true; - - if (dst->remote_ifindex) { - remote_dev = __dev_get_by_index(net, dst->remote_ifindex); - if (!remote_dev) { - err = -ENODEV; - goto errout; - } - - err = netdev_upper_dev_link(remote_dev, dev, extack); - if (err) - goto errout; - } - - err = rtnl_configure_link(dev, NULL, 0, NULL); - if (err < 0) - goto unlink; - if (f) { vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f); /* notify default fdb entry */ err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, true, extack); - if (err) { - vxlan_fdb_destroy(vxlan, f, false, false); - if (remote_dev) - netdev_upper_dev_unlink(remote_dev, dev); - goto unregister; - } + if (err) + goto fdb_destroy; } list_add(&vxlan->next, &vn->vxlan_list); if (remote_dev) dst->remote_dev = remote_dev; return 0; + +fdb_destroy: + vxlan_fdb_destroy(vxlan, f, false, false); unlink: if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); -errout: - /* unregister_netdevice() destroys the default FDB entry with deletion - * notification. But the addition notification was not sent yet, so - * destroy the entry by hand here. - */ - if (f) - __vxlan_fdb_free(f); unregister: - if (unregister) - unregister_netdevice(dev); + unregister_netdevice(dev); return err; } @@ -4520,10 +4495,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_fdb_flush_desc desc = { - /* Default entry is deleted at vxlan_uninit. */ - .ignore_default_entry = true, - }; + struct vxlan_fdb_flush_desc desc = {}; vxlan_flush(vxlan, &desc); From 69281e0fe18ab60e7b67e328214ffbd8609b13b4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:31 +0300 Subject: [PATCH 03/15] vxlan: Insert FDB into hash table in vxlan_fdb_create() Commit 7c31e54aeee5 ("vxlan: do not destroy fdb if register_netdevice() is failed") split the insertion of FDB entries into the FDB hash table from the function where they are created. This was done in order to work around a problem that is no longer possible after the previous patch. Simplify the code and move the body of vxlan_fdb_insert() back into vxlan_fdb_create(). Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-4-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 3df86927b1ec..915ce73f0c87 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -816,14 +816,6 @@ static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, return f; } -static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, - __be32 src_vni, struct vxlan_fdb *f) -{ - ++vxlan->addrcnt; - hlist_add_head_rcu(&f->hlist, - vxlan_fdb_head(vxlan, mac, src_vni)); -} - static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, u32 nhid, struct netlink_ext_ack *extack) { @@ -913,6 +905,10 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, if (rc < 0) goto errout; + ++vxlan->addrcnt; + hlist_add_head_rcu(&f->hlist, + vxlan_fdb_head(vxlan, mac, src_vni)); + *fdb = f; return 0; @@ -1101,7 +1097,6 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, if (rc < 0) return rc; - vxlan_fdb_insert(vxlan, mac, src_vni, f); rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, swdev_notify, extack); if (rc) @@ -3994,8 +3989,6 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, } if (f) { - vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f); - /* notify default fdb entry */ err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, true, extack); From ccc203b9a846c1d5262b5c60565ff79930c13477 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:32 +0300 Subject: [PATCH 04/15] vxlan: Unsplit default FDB entry creation and notification Commit 0241b836732f ("vxlan: fix default fdb entry netlink notify ordering during netdev create") split the creation of the default FDB entry from its notification to avoid sending a RTM_NEWNEIGH notification before RTM_NEWLINK. Previous patches restructured the code so that the default FDB entry is created after registering the VXLAN device and the notification about the new entry immediately follows its creation. Therefore, simplify the code and revert back to vxlan_fdb_update() which takes care of both creating the FDB entry and notifying user space about it. Hold the FDB hash lock when calling vxlan_fdb_update() like it expects. A subsequent patch will add a lockdep assertion to make sure this is indeed the case. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-5-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 915ce73f0c87..d3dfc4af9556 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3943,7 +3943,6 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *remote_dev = NULL; - struct vxlan_fdb *f = NULL; struct vxlan_rdst *dst; int err; @@ -3976,33 +3975,29 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { - err = vxlan_fdb_create(vxlan, all_zeros_mac, + u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, + dst->remote_vni); + + spin_lock_bh(&vxlan->hash_lock[hash_index]); + err = vxlan_fdb_update(vxlan, all_zeros_mac, &dst->remote_ip, NUD_REACHABLE | NUD_PERMANENT, + NLM_F_EXCL | NLM_F_CREATE, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, - NTF_SELF, 0, &f, extack); + NTF_SELF, 0, true, extack); + spin_unlock_bh(&vxlan->hash_lock[hash_index]); if (err) goto unlink; } - if (f) { - /* notify default fdb entry */ - err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), - RTM_NEWNEIGH, true, extack); - if (err) - goto fdb_destroy; - } - list_add(&vxlan->next, &vn->vxlan_list); if (remote_dev) dst->remote_dev = remote_dev; return 0; -fdb_destroy: - vxlan_fdb_destroy(vxlan, f, false, false); unlink: if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); From 6ba480cca25f26bd0f0e328b2d2f8eab6bde7cc0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:33 +0300 Subject: [PATCH 05/15] vxlan: Relocate assignment of default remote device The default FDB entry can be associated with a net device if a physical device (i.e., 'dev PHYS_DEV') was specified during the creation of the VXLAN device. The assignment of the net device pointer to 'dst->remote_dev' logically belongs in the if block that resolves the pointer from the specified ifindex, so move it there. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-6-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index d3dfc4af9556..d45b63180714 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3967,6 +3967,8 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, err = netdev_upper_dev_link(remote_dev, dev, extack); if (err) goto unregister; + + dst->remote_dev = remote_dev; } err = rtnl_configure_link(dev, NULL, 0, NULL); @@ -3994,8 +3996,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, } list_add(&vxlan->next, &vn->vxlan_list); - if (remote_dev) - dst->remote_dev = remote_dev; + return 0; unlink: From 094adad91310d9f8f8485251129482f4f3e2c5b3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:34 +0300 Subject: [PATCH 06/15] vxlan: Use a single lock to protect the FDB table Currently, the VXLAN driver stores FDB entries in a hash table with a fixed number of buckets (256). Subsequent patches are going to convert this table to rhashtable with a linked list for entry traversal, as rhashtable is more scalable. In preparation for this conversion, move from a per-bucket spin lock to a single spin lock that protects the entire FDB table. The per-bucket spin locks were introduced by commit fe1e0713bbe8 ("vxlan: Use FDB_HASH_SIZE hash_locks to reduce contention") citing "huge contention when inserting/deleting vxlan_fdbs into the fdb_head". It is not clear from the commit message which code path was holding the spin lock for long periods of time, but the obvious suspect is the FDB cleanup routine (vxlan_cleanup()) that periodically traverses the entire table in order to delete aged-out entries. This will be solved by subsequent patches that will convert the FDB cleanup routine to traverse the linked list of FDB entries using RCU, only acquiring the spin lock when deleting an aged-out entry. The change reduces the size of the VXLAN device structure from 3600 bytes to 2576 bytes. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-7-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 82 +++++++++++------------------ drivers/net/vxlan/vxlan_vnifilter.c | 8 ++- include/net/vxlan.h | 2 +- 3 files changed, 34 insertions(+), 58 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index d45b63180714..b8edd8afda28 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -524,8 +524,8 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, return -EINVAL; vxlan = netdev_priv(dev); + spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { if (f->vni == vni) { list_for_each_entry(rdst, &f->remotes, list) { @@ -537,12 +537,12 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, } } } - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); return 0; unlock: - spin_unlock_bh(&vxlan->hash_lock[h]); + spin_unlock_bh(&vxlan->hash_lock); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_replay); @@ -558,14 +558,14 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) return; vxlan = netdev_priv(dev); + spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) if (f->vni == vni) list_for_each_entry(rdst, &f->remotes, list) rdst->offloaded = false; - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); } EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); @@ -1248,7 +1248,6 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], __be16 port; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; int err; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { @@ -1268,13 +1267,12 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) return -EAFNOSUPPORT; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, nhid, true, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; @@ -1325,7 +1323,6 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], union vxlan_addr ip; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; __be16 port; int err; @@ -1334,11 +1331,10 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], if (err) return err; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; @@ -1486,10 +1482,8 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, rdst->remote_ip = *src_ip; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { - u32 hash_index = fdb_head_index(vxlan, src_mac, vni); - /* learned new entry */ - spin_lock(&vxlan->hash_lock[hash_index]); + spin_lock(&vxlan->hash_lock); /* close off race between vxlan_flush and incoming packets */ if (netif_running(dev)) @@ -1500,7 +1494,7 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, vni, vxlan->default_dst.remote_vni, ifindex, NTF_SELF, 0, true, NULL); - spin_unlock(&vxlan->hash_lock[hash_index]); + spin_unlock(&vxlan->hash_lock); } return SKB_NOT_DROPPED_YET; @@ -2839,10 +2833,10 @@ static void vxlan_cleanup(struct timer_list *t) if (!netif_running(vxlan->dev)) return; + spin_lock(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; - spin_lock(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); @@ -2864,8 +2858,8 @@ static void vxlan_cleanup(struct timer_list *t) } else if (time_before(timeout, next_timer)) next_timer = timeout; } - spin_unlock(&vxlan->hash_lock[h]); } + spin_unlock(&vxlan->hash_lock); mod_timer(&vxlan->age_timer, next_timer); } @@ -3056,10 +3050,10 @@ static void vxlan_flush(struct vxlan_dev *vxlan, bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); unsigned int h; + spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; - spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); @@ -3079,8 +3073,8 @@ static void vxlan_flush(struct vxlan_dev *vxlan, vxlan_fdb_destroy(vxlan, f, true, true); } - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { @@ -3358,15 +3352,14 @@ static void vxlan_setup(struct net_device *dev) dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; INIT_LIST_HEAD(&vxlan->next); + spin_lock_init(&vxlan->hash_lock); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_init(&vxlan->hash_lock[h]); + for (h = 0; h < FDB_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vxlan->fdb_head[h]); - } } static void vxlan_ether_setup(struct net_device *dev) @@ -3977,10 +3970,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, - dst->remote_vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, all_zeros_mac, &dst->remote_ip, NUD_REACHABLE | NUD_PERMANENT, @@ -3990,7 +3980,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dst->remote_vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (err) goto unlink; } @@ -4420,9 +4410,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], /* handle default dst entry */ if (rem_ip_changed) { - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (!vxlan_addr_any(&conf.remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, &conf.remote_ip, @@ -4433,7 +4421,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], conf.remote_ifindex, NTF_SELF, 0, true, extack); if (err) { - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; @@ -4447,7 +4435,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], dst->remote_vni, dst->remote_ifindex, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); /* If vni filtering device, also update fdb entries of * all vnis that were using default remote ip @@ -4747,11 +4735,8 @@ vxlan_fdb_offloaded_set(struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst; struct vxlan_fdb *f; - u32 hash_index; - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) @@ -4767,7 +4752,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev, rdst->offloaded = fdb_info->offloaded; out: - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); } static int @@ -4776,13 +4761,11 @@ vxlan_fdb_external_learn_add(struct net_device *dev, { struct vxlan_dev *vxlan = netdev_priv(dev); struct netlink_ext_ack *extack; - u32 hash_index; int err; - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); extack = switchdev_notifier_info_to_extack(&fdb_info->info); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip, NUD_REACHABLE, NLM_F_CREATE | NLM_F_REPLACE, @@ -4792,7 +4775,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, 0, false, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } @@ -4803,11 +4786,9 @@ vxlan_fdb_external_learn_del(struct net_device *dev, { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; - u32 hash_index; int err = 0; - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) @@ -4821,7 +4802,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev, fdb_info->remote_ifindex, false); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } @@ -4870,18 +4851,15 @@ static void vxlan_fdb_nh_flush(struct nexthop *nh) { struct vxlan_fdb *fdb; struct vxlan_dev *vxlan; - u32 hash_index; rcu_read_lock(); list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) { vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); - hash_index = fdb_head_index(vxlan, fdb->eth_addr, - vxlan->default_dst.remote_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (!hlist_unhashed(&fdb->hlist)) vxlan_fdb_destroy(vxlan, fdb, false, false); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); } rcu_read_unlock(); } diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index d0753776d339..336cd0e20309 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -482,11 +482,9 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, struct netlink_ext_ack *extack) { struct vxlan_rdst *dst = &vxlan->default_dst; - u32 hash_index; int err = 0; - hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (remote_ip && !vxlan_addr_any(remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, remote_ip, @@ -498,7 +496,7 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); if (err) { - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } } @@ -511,7 +509,7 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, dst->remote_ifindex, true); } - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 2dd23ee2bacd..272e11708a33 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -296,7 +296,7 @@ struct vxlan_dev { struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; - spinlock_t hash_lock[FDB_HASH_SIZE]; + spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; From 8d45673d2d2e59d03e108c569a3e8c031aa534c8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:35 +0300 Subject: [PATCH 07/15] vxlan: Add a linked list of FDB entries Currently, FDB entries are stored in a hash table with a fixed number of buckets. The table is used for both lookups and entry traversal. Subsequent patches will convert the table to rhashtable which is not suitable for entry traversal. In preparation for this conversion, add FDB entries to a linked list. Subsequent patches will convert the driver to use this list when traversing entries during dump, flush, etc. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-8-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 3 +++ drivers/net/vxlan/vxlan_private.h | 1 + include/net/vxlan.h | 1 + 3 files changed, 5 insertions(+) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index b8edd8afda28..511c24e29d45 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -908,6 +908,7 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, ++vxlan->addrcnt; hlist_add_head_rcu(&f->hlist, vxlan_fdb_head(vxlan, mac, src_vni)); + hlist_add_head_rcu(&f->fdb_node, &vxlan->fdb_list); *fdb = f; @@ -962,6 +963,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, swdev_notify, NULL); } + hlist_del_init_rcu(&f->fdb_node); hlist_del_rcu(&f->hlist); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); @@ -3360,6 +3362,7 @@ static void vxlan_setup(struct net_device *dev) for (h = 0; h < FDB_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vxlan->fdb_head[h]); + INIT_HLIST_HEAD(&vxlan->fdb_list); } static void vxlan_ether_setup(struct net_device *dev) diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h index 76a351a997d5..078702ec604d 100644 --- a/drivers/net/vxlan/vxlan_private.h +++ b/drivers/net/vxlan/vxlan_private.h @@ -36,6 +36,7 @@ struct vxlan_fdb { __be32 vni; u16 flags; /* see ndm_flags and below */ struct list_head nh_list; + struct hlist_node fdb_node; struct nexthop __rcu *nh; struct vxlan_dev __rcu *vdev; }; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 272e11708a33..96a6c6f45c2e 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -307,6 +307,7 @@ struct vxlan_dev { struct hlist_head fdb_head[FDB_HASH_SIZE]; struct rhashtable mdb_tbl; + struct hlist_head fdb_list; struct hlist_head mdb_list; unsigned int mdb_seq; }; From 7aa0dc750d4bca9545c89803e3779bdff4c58b58 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:36 +0300 Subject: [PATCH 08/15] vxlan: Use linked list to traverse FDB entries In preparation for removing the fixed size hash table, convert FDB entry traversal to use the newly added FDB linked list. No functional changes intended. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-9-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 174 ++++++++++++++------------------- 1 file changed, 76 insertions(+), 98 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 511c24e29d45..f9840a4b6e44 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -517,7 +517,6 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; - unsigned int h; int rc = 0; if (!netif_is_vxlan(dev)) @@ -525,16 +524,13 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, vxlan = netdev_priv(dev); spin_lock_bh(&vxlan->hash_lock); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { - if (f->vni == vni) { - list_for_each_entry(rdst, &f->remotes, list) { - rc = vxlan_fdb_notify_one(nb, vxlan, - f, rdst, - extack); - if (rc) - goto unlock; - } + hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { + if (f->vni == vni) { + list_for_each_entry(rdst, &f->remotes, list) { + rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, + extack); + if (rc) + goto unlock; } } } @@ -552,18 +548,17 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; - unsigned int h; if (!netif_is_vxlan(dev)) return; vxlan = netdev_priv(dev); spin_lock_bh(&vxlan->hash_lock); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) - if (f->vni == vni) - list_for_each_entry(rdst, &f->remotes, list) - rdst->offloaded = false; + hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { + if (f->vni == vni) { + list_for_each_entry(rdst, &f->remotes, list) + rdst->offloaded = false; + } } spin_unlock_bh(&vxlan->hash_lock); @@ -1351,52 +1346,46 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, { struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned int h; + struct vxlan_fdb *f; int err = 0; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct vxlan_fdb *f; + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { + struct vxlan_rdst *rd; - rcu_read_lock(); - hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { - struct vxlan_rdst *rd; - - if (rcu_access_pointer(f->nh)) { - if (*idx < ctx->fdb_idx) - goto skip_nh; - err = vxlan_fdb_info(skb, vxlan, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, NULL); - if (err < 0) { - rcu_read_unlock(); - goto out; - } + if (rcu_access_pointer(f->nh)) { + if (*idx < ctx->fdb_idx) + goto skip_nh; + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, NULL); + if (err < 0) { + rcu_read_unlock(); + goto out; + } skip_nh: - *idx += 1; - continue; - } - - list_for_each_entry_rcu(rd, &f->remotes, list) { - if (*idx < ctx->fdb_idx) - goto skip; - - err = vxlan_fdb_info(skb, vxlan, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, rd); - if (err < 0) { - rcu_read_unlock(); - goto out; - } -skip: - *idx += 1; - } + *idx += 1; + continue; + } + + list_for_each_entry_rcu(rd, &f->remotes, list) { + if (*idx < ctx->fdb_idx) + goto skip; + + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, rd); + if (err < 0) { + rcu_read_unlock(); + goto out; + } +skip: + *idx += 1; } - rcu_read_unlock(); } + rcu_read_unlock(); out: return err; } @@ -2830,35 +2819,30 @@ static void vxlan_cleanup(struct timer_list *t) { struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; - unsigned int h; + struct hlist_node *n; + struct vxlan_fdb *f; if (!netif_running(vxlan->dev)) return; spin_lock(&vxlan->hash_lock); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; + hlist_for_each_entry_safe(f, n, &vxlan->fdb_list, fdb_node) { + unsigned long timeout; - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); - unsigned long timeout; + if (f->state & (NUD_PERMANENT | NUD_NOARP)) + continue; - if (f->state & (NUD_PERMANENT | NUD_NOARP)) - continue; + if (f->flags & NTF_EXT_LEARNED) + continue; - if (f->flags & NTF_EXT_LEARNED) - continue; - - timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; - if (time_before_eq(timeout, jiffies)) { - netdev_dbg(vxlan->dev, - "garbage collect %pM\n", - f->eth_addr); - f->state = NUD_STALE; - vxlan_fdb_destroy(vxlan, f, true, true); - } else if (time_before(timeout, next_timer)) - next_timer = timeout; + timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; + if (time_before_eq(timeout, jiffies)) { + netdev_dbg(vxlan->dev, "garbage collect %pM\n", + f->eth_addr); + f->state = NUD_STALE; + vxlan_fdb_destroy(vxlan, f, true, true); + } else if (time_before(timeout, next_timer)) { + next_timer = timeout; } } spin_unlock(&vxlan->hash_lock); @@ -3050,31 +3034,25 @@ static void vxlan_flush(struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); - unsigned int h; + struct hlist_node *n; + struct vxlan_fdb *f; spin_lock_bh(&vxlan->hash_lock); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; + hlist_for_each_entry_safe(f, n, &vxlan->fdb_list, fdb_node) { + if (!vxlan_fdb_flush_matches(f, vxlan, desc)) + continue; - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); + if (match_remotes) { + bool destroy_fdb = false; - if (!vxlan_fdb_flush_matches(f, vxlan, desc)) + vxlan_fdb_flush_match_remotes(f, vxlan, desc, + &destroy_fdb); + + if (!destroy_fdb) continue; - - if (match_remotes) { - bool destroy_fdb = false; - - vxlan_fdb_flush_match_remotes(f, vxlan, desc, - &destroy_fdb); - - if (!destroy_fdb) - continue; - } - - vxlan_fdb_destroy(vxlan, f, true, true); } + + vxlan_fdb_destroy(vxlan, f, true, true); } spin_unlock_bh(&vxlan->hash_lock); } @@ -4860,7 +4838,7 @@ static void vxlan_fdb_nh_flush(struct nexthop *nh) vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); spin_lock_bh(&vxlan->hash_lock); - if (!hlist_unhashed(&fdb->hlist)) + if (!hlist_unhashed(&fdb->fdb_node)) vxlan_fdb_destroy(vxlan, fdb, false, false); spin_unlock_bh(&vxlan->hash_lock); } From a6d04f8937e3a721c3a3578fc18b28cb59b15f42 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:37 +0300 Subject: [PATCH 09/15] vxlan: Convert FDB garbage collection to RCU Instead of holding the FDB hash lock when traversing the FDB linked list during garbage collection, use RCU and only acquire the lock for entries that need to be removed (aged out). Avoid races by using hlist_unhashed() to check that the entry has not been removed from the list by another thread. Note that vxlan_fdb_destroy() uses hlist_del_init_rcu() to remove an entry from the list which should cause list_unhashed() to return true. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-10-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index f9840a4b6e44..c3511a43ce99 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2819,14 +2819,13 @@ static void vxlan_cleanup(struct timer_list *t) { struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; - struct hlist_node *n; struct vxlan_fdb *f; if (!netif_running(vxlan->dev)) return; - spin_lock(&vxlan->hash_lock); - hlist_for_each_entry_safe(f, n, &vxlan->fdb_list, fdb_node) { + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { unsigned long timeout; if (f->state & (NUD_PERMANENT | NUD_NOARP)) @@ -2837,15 +2836,19 @@ static void vxlan_cleanup(struct timer_list *t) timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { - netdev_dbg(vxlan->dev, "garbage collect %pM\n", - f->eth_addr); - f->state = NUD_STALE; - vxlan_fdb_destroy(vxlan, f, true, true); + spin_lock(&vxlan->hash_lock); + if (!hlist_unhashed(&f->fdb_node)) { + netdev_dbg(vxlan->dev, "garbage collect %pM\n", + f->eth_addr); + f->state = NUD_STALE; + vxlan_fdb_destroy(vxlan, f, true, true); + } + spin_unlock(&vxlan->hash_lock); } else if (time_before(timeout, next_timer)) { next_timer = timeout; } } - spin_unlock(&vxlan->hash_lock); + rcu_read_unlock(); mod_timer(&vxlan->age_timer, next_timer); } From 54f45187b635f8c4c1e554a2ed347bba7d27dfbd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:38 +0300 Subject: [PATCH 10/15] vxlan: Convert FDB flushing to RCU Instead of holding the FDB hash lock when traversing the FDB linked list during flushing, use RCU and only acquire the lock for entries that need to be flushed. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-11-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index c3511a43ce99..762dde70d9e9 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3037,14 +3037,17 @@ static void vxlan_flush(struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); - struct hlist_node *n; struct vxlan_fdb *f; - spin_lock_bh(&vxlan->hash_lock); - hlist_for_each_entry_safe(f, n, &vxlan->fdb_list, fdb_node) { + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { if (!vxlan_fdb_flush_matches(f, vxlan, desc)) continue; + spin_lock_bh(&vxlan->hash_lock); + if (hlist_unhashed(&f->fdb_node)) + goto unlock; + if (match_remotes) { bool destroy_fdb = false; @@ -3052,12 +3055,14 @@ static void vxlan_flush(struct vxlan_dev *vxlan, &destroy_fdb); if (!destroy_fdb) - continue; + goto unlock; } vxlan_fdb_destroy(vxlan, f, true, true); +unlock: + spin_unlock_bh(&vxlan->hash_lock); } - spin_unlock_bh(&vxlan->hash_lock); + rcu_read_unlock(); } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { From 5cde39ea38813ebb5bf07922a3ba60871edebf99 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:39 +0300 Subject: [PATCH 11/15] vxlan: Rename FDB Tx lookup function vxlan_find_mac() is only expected to be called from the Tx path as it updates the 'used' timestamp. Rename it to vxlan_find_mac_tx() to reflect that and to avoid incorrect updates of this timestamp like those addressed by commit 9722f834fe9a ("vxlan: Avoid unnecessary updates to FDB 'used' time"). No functional changes intended. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-12-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 762dde70d9e9..397b1691ab06 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -429,8 +429,8 @@ static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, return NULL; } -static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) +static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) { struct vxlan_fdb *f; @@ -1897,7 +1897,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) } rcu_read_lock(); - f = vxlan_find_mac(vxlan, n->ha, vni); + f = vxlan_find_mac_tx(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); @@ -2063,7 +2063,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; } - f = vxlan_find_mac(vxlan, n->ha, vni); + f = vxlan_find_mac_tx(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); @@ -2762,7 +2762,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) eth = eth_hdr(skb); rcu_read_lock(); - f = vxlan_find_mac(vxlan, eth->h_dest, vni); + f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && @@ -2770,11 +2770,11 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) - f = vxlan_find_mac(vxlan, eth->h_dest, vni); + f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); } if (f == NULL) { - f = vxlan_find_mac(vxlan, all_zeros_mac, vni); + f = vxlan_find_mac_tx(vxlan, all_zeros_mac, vni); if (f == NULL) { if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) From ebe6420674551751e6a888180ebda76f1dddf911 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:40 +0300 Subject: [PATCH 12/15] vxlan: Create wrappers for FDB lookup __vxlan_find_mac() is called from both the data path (e.g., during learning) and the control path (e.g., when replacing an entry). The function is missing lockdep annotations to make sure that the FDB hash lock is held during FDB updates. Rename __vxlan_find_mac() to vxlan_find_mac_rcu() to reflect the fact that it should be called from an RCU read-side critical section and call it from vxlan_find_mac() which checks that the FDB hash lock is held. Change callers to invoke the appropriate function. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-13-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 397b1691ab06..2846c8c5234e 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -409,8 +409,8 @@ static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, } /* Look up Ethernet address in forwarding table */ -static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) +static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) { struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); struct vxlan_fdb *f; @@ -434,7 +434,7 @@ static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, { struct vxlan_fdb *f; - f = __vxlan_find_mac(vxlan, mac, vni); + f = vxlan_find_mac_rcu(vxlan, mac, vni); if (f) { unsigned long now = jiffies; @@ -445,6 +445,20 @@ static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, return f; } +static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) +{ + struct vxlan_fdb *f; + + lockdep_assert_held_once(&vxlan->hash_lock); + + rcu_read_lock(); + f = vxlan_find_mac_rcu(vxlan, mac, vni); + rcu_read_unlock(); + + return f; +} + /* caller should hold vxlan->hash_lock */ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, @@ -480,7 +494,7 @@ int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, rcu_read_lock(); - f = __vxlan_find_mac(vxlan, eth_addr, vni); + f = vxlan_find_mac_rcu(vxlan, eth_addr, vni); if (!f) { rc = -ENOENT; goto out; @@ -1117,7 +1131,7 @@ int vxlan_fdb_update(struct vxlan_dev *vxlan, { struct vxlan_fdb *f; - f = __vxlan_find_mac(vxlan, mac, src_vni); + f = vxlan_find_mac(vxlan, mac, src_vni); if (f) { if (flags & NLM_F_EXCL) { netdev_dbg(vxlan->dev, @@ -1286,7 +1300,7 @@ int __vxlan_fdb_delete(struct vxlan_dev *vxlan, struct vxlan_fdb *f; int err = -ENOENT; - f = __vxlan_find_mac(vxlan, addr, src_vni); + f = vxlan_find_mac(vxlan, addr, src_vni); if (!f) return err; @@ -1409,7 +1423,7 @@ static int vxlan_fdb_get(struct sk_buff *skb, rcu_read_lock(); - f = __vxlan_find_mac(vxlan, addr, vni); + f = vxlan_find_mac_rcu(vxlan, addr, vni); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; @@ -1445,7 +1459,7 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, ifindex = src_ifindex; #endif - f = __vxlan_find_mac(vxlan, src_mac, vni); + f = vxlan_find_mac_rcu(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); unsigned long now = jiffies; @@ -4727,7 +4741,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev, spin_lock_bh(&vxlan->hash_lock); - f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) goto out; @@ -4779,7 +4793,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev, spin_lock_bh(&vxlan->hash_lock); - f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) err = -ENOENT; else if (f->flags & NTF_EXT_LEARNED) From 20c76dadc783759fd3819d289c72be590660cc8b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:41 +0300 Subject: [PATCH 13/15] vxlan: Do not treat dst cache initialization errors as fatal FDB entries are allocated in an atomic context as they can be added from the data path when learning is enabled. After converting the FDB hash table to rhashtable, the insertion rate will be much higher (*) which will entail a much higher rate of per-CPU allocations via dst_cache_init(). When adding a large number of entries (e.g., 256k) in a batch, a small percentage (< 0.02%) of these per-CPU allocations will fail [1]. This does not happen with the current code since the insertion rate is low enough to give the per-CPU allocator a chance to asynchronously create new chunks of per-CPU memory. Given that: a. Only a small percentage of these per-CPU allocations fail. b. The scenario where this happens might not be the most realistic one. c. The driver can work correctly without dst caches. The dst_cache_*() APIs first check that the dst cache was properly initialized. d. The dst caches are not always used (e.g., 'tos inherit'). It seems reasonable to not treat these allocation failures as fatal. Therefore, do not bail when dst_cache_init() fails and suppress warnings by specifying '__GFP_NOWARN'. [1] percpu: allocation failed, size=40 align=8 atomic=1, atomic alloc failed, no space left (*) 97% reduction in average latency of vxlan_fdb_update() when adding 256k entries in a batch. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-14-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 2846c8c5234e..5c0752161529 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -619,10 +619,10 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, if (rd == NULL) return -ENOMEM; - if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { - kfree(rd); - return -ENOMEM; - } + /* The driver can work correctly without a dst cache, so do not treat + * dst cache initialization errors as fatal. + */ + dst_cache_init(&rd->dst_cache, GFP_ATOMIC | __GFP_NOWARN); rd->remote_ip = *ip; rd->remote_port = port; From f13f3b4157dd9a05a5b651dfd15a4efa2df06f67 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:42 +0300 Subject: [PATCH 14/15] vxlan: Introduce FDB key structure In preparation for converting the FDB table to rhashtable, introduce a key structure that includes the MAC address and source VNI. No functional changes intended. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-15-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 44 ++++++++++++++++--------------- drivers/net/vxlan/vxlan_private.h | 8 ++++-- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 5c0752161529..8e359cf8dbbd 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -186,7 +186,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, } else if (nh) { ndm->ndm_family = nh_family; } - send_eth = !is_zero_ether_addr(fdb->eth_addr); + send_eth = !is_zero_ether_addr(fdb->key.eth_addr); } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; @@ -201,7 +201,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, peernet2id(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; - if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) + if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.eth_addr)) goto nla_put_failure; if (nh) { if (nla_put_u32(skb, NDA_NH_ID, nh_id)) @@ -223,9 +223,9 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, goto nla_put_failure; } - if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && + if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->key.vni && nla_put_u32(skb, NDA_SRC_VNI, - be32_to_cpu(fdb->vni))) + be32_to_cpu(fdb->key.vni))) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - READ_ONCE(fdb->used)); @@ -293,8 +293,8 @@ static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, fdb_info->remote_port = rd->remote_port; fdb_info->remote_vni = rd->remote_vni; fdb_info->remote_ifindex = rd->remote_ifindex; - memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN); - fdb_info->vni = fdb->vni; + memcpy(fdb_info->eth_addr, fdb->key.eth_addr, ETH_ALEN); + fdb_info->vni = fdb->key.vni; fdb_info->offloaded = rd->offloaded; fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER; } @@ -366,7 +366,7 @@ static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) }; struct vxlan_rdst remote = { }; - memcpy(f.eth_addr, eth_addr, ETH_ALEN); + memcpy(f.key.eth_addr, eth_addr, ETH_ALEN); vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } @@ -416,9 +416,9 @@ static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, struct vxlan_fdb *f; hlist_for_each_entry_rcu(f, head, hlist) { - if (ether_addr_equal(mac, f->eth_addr)) { + if (ether_addr_equal(mac, f->key.eth_addr)) { if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { - if (vni == f->vni) + if (vni == f->key.vni) return f; } else { return f; @@ -539,7 +539,7 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, spin_lock_bh(&vxlan->hash_lock); hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { - if (f->vni == vni) { + if (f->key.vni == vni) { list_for_each_entry(rdst, &f->remotes, list) { rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, extack); @@ -569,7 +569,7 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) spin_lock_bh(&vxlan->hash_lock); hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { - if (f->vni == vni) { + if (f->key.vni == vni) { list_for_each_entry(rdst, &f->remotes, list) rdst->offloaded = false; } @@ -812,15 +812,16 @@ static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return NULL; + memset(&f->key, 0, sizeof(f->key)); f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; - f->vni = src_vni; + f->key.vni = src_vni; f->nh = NULL; RCU_INIT_POINTER(f->vdev, vxlan); INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); - memcpy(f->eth_addr, mac, ETH_ALEN); + memcpy(f->key.eth_addr, mac, ETH_ALEN); return f; } @@ -959,7 +960,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, { struct vxlan_rdst *rd; - netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); + netdev_dbg(vxlan->dev, "delete %pM\n", f->key.eth_addr); --vxlan->addrcnt; if (do_notify) { @@ -1031,8 +1032,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, if ((flags & NLM_F_REPLACE)) { /* Only change unicasts */ - if (!(is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { + if (!(is_multicast_ether_addr(f->key.eth_addr) || + is_zero_ether_addr(f->key.eth_addr))) { if (nhid) { rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); if (rc < 0) @@ -1048,8 +1049,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, } } if ((flags & NLM_F_APPEND) && - (is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { + (is_multicast_ether_addr(f->key.eth_addr) || + is_zero_ether_addr(f->key.eth_addr))) { rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) @@ -2853,7 +2854,7 @@ static void vxlan_cleanup(struct timer_list *t) spin_lock(&vxlan->hash_lock); if (!hlist_unhashed(&f->fdb_node)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", - f->eth_addr); + f->key.eth_addr); f->state = NUD_STALE; vxlan_fdb_destroy(vxlan, f, true, true); } @@ -2972,7 +2973,8 @@ struct vxlan_fdb_flush_desc { static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan) { - return is_zero_ether_addr(f->eth_addr) && f->vni == vxlan->cfg.vni; + return is_zero_ether_addr(f->key.eth_addr) && + f->key.vni == vxlan->cfg.vni; } static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid) @@ -2995,7 +2997,7 @@ static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f, if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan)) return false; - if (desc->src_vni && f->vni != desc->src_vni) + if (desc->src_vni && f->key.vni != desc->src_vni) return false; if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid)) diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h index 078702ec604d..3ca19e7167c9 100644 --- a/drivers/net/vxlan/vxlan_private.h +++ b/drivers/net/vxlan/vxlan_private.h @@ -24,6 +24,11 @@ struct vxlan_net { struct notifier_block nexthop_notifier_block; }; +struct vxlan_fdb_key { + u8 eth_addr[ETH_ALEN]; + __be32 vni; +}; + /* Forwarding table entry */ struct vxlan_fdb { struct hlist_node hlist; /* linked list of entries */ @@ -31,9 +36,8 @@ struct vxlan_fdb { unsigned long updated; /* jiffies */ unsigned long used; struct list_head remotes; - u8 eth_addr[ETH_ALEN]; + struct vxlan_fdb_key key; u16 state; /* see ndm_state */ - __be32 vni; u16 flags; /* see ndm_flags and below */ struct list_head nh_list; struct hlist_node fdb_node; From 1f763fa808e92a67feea8364ef80ca3065d74702 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:43 +0300 Subject: [PATCH 15/15] vxlan: Convert FDB table to rhashtable FDB entries are currently stored in a hash table with a fixed number of buckets (256), resulting in performance degradation as the number of entries grows. Solve this by converting the driver to use rhashtable which maintains more or less constant performance regardless of the number of entries. Measured transmitted packets per second using a single pktgen thread with varying number of entries when the transmitted packet always hits the default entry (worst case): Number of entries | Improvement ------------------|------------ 1k | +1.12% 4k | +9.22% 16k | +55% 64k | +585% 256k | +2460% In addition, the change reduces the size of the VXLAN device structure from 2584 bytes to 672 bytes. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-16-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 102 ++++++++++++------------------ drivers/net/vxlan/vxlan_private.h | 2 +- include/net/vxlan.h | 2 +- 3 files changed, 43 insertions(+), 63 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 8e359cf8dbbd..a56d7239b127 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -63,8 +64,12 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); -/* salt for hash table */ -static u32 vxlan_salt __read_mostly; +static const struct rhashtable_params vxlan_fdb_rht_params = { + .head_offset = offsetof(struct vxlan_fdb, rhnode), + .key_offset = offsetof(struct vxlan_fdb, key), + .key_len = sizeof(struct vxlan_fdb_key), + .automatic_shrinking = true, +}; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { @@ -371,62 +376,21 @@ static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } -/* Hash Ethernet address */ -static u32 eth_hash(const unsigned char *addr) -{ - u64 value = get_unaligned((u64 *)addr); - - /* only want 6 bytes */ -#ifdef __BIG_ENDIAN - value >>= 16; -#else - value <<= 16; -#endif - return hash_64(value, FDB_HASH_BITS); -} - -u32 eth_vni_hash(const unsigned char *addr, __be32 vni) -{ - /* use 1 byte of OUI and 3 bytes of NIC */ - u32 key = get_unaligned((u32 *)(addr + 2)); - - return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); -} - -u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) -{ - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) - return eth_vni_hash(mac, vni); - else - return eth_hash(mac); -} - -/* Hash chain to use given mac address */ -static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) -{ - return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)]; -} - /* Look up Ethernet address in forwarding table */ static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { - struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); - struct vxlan_fdb *f; + struct vxlan_fdb_key key; - hlist_for_each_entry_rcu(f, head, hlist) { - if (ether_addr_equal(mac, f->key.eth_addr)) { - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { - if (vni == f->key.vni) - return f; - } else { - return f; - } - } - } + memset(&key, 0, sizeof(key)); + memcpy(key.eth_addr, mac, sizeof(key.eth_addr)); + if (!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) + key.vni = vxlan->default_dst.remote_vni; + else + key.vni = vni; - return NULL; + return rhashtable_lookup(&vxlan->fdb_hash_tbl, &key, + vxlan_fdb_rht_params); } static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, @@ -915,15 +879,27 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, if (rc < 0) goto errout; + rc = rhashtable_lookup_insert_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); + if (rc) + goto destroy_remote; + ++vxlan->addrcnt; - hlist_add_head_rcu(&f->hlist, - vxlan_fdb_head(vxlan, mac, src_vni)); hlist_add_head_rcu(&f->fdb_node, &vxlan->fdb_list); *fdb = f; return 0; +destroy_remote: + if (rcu_access_pointer(f->nh)) { + list_del_rcu(&f->nh_list); + nexthop_put(rtnl_dereference(f->nh)); + } else { + list_del(&rd->list); + dst_cache_destroy(&rd->dst_cache); + kfree(rd); + } errout: kfree(f); return rc; @@ -974,7 +950,8 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, } hlist_del_init_rcu(&f->fdb_node); - hlist_del_rcu(&f->hlist); + rhashtable_remove_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } @@ -2898,10 +2875,14 @@ static int vxlan_init(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; + err = rhashtable_init(&vxlan->fdb_hash_tbl, &vxlan_fdb_rht_params); + if (err) + return err; + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnigroup_init(vxlan); if (err) - return err; + goto err_rhashtable_destroy; } err = gro_cells_init(&vxlan->gro_cells, dev); @@ -2920,6 +2901,8 @@ static int vxlan_init(struct net_device *dev) err_vnigroup_uninit: if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); +err_rhashtable_destroy: + rhashtable_destroy(&vxlan->fdb_hash_tbl); return err; } @@ -2933,6 +2916,8 @@ static void vxlan_uninit(struct net_device *dev) vxlan_vnigroup_uninit(vxlan); gro_cells_destroy(&vxlan->gro_cells); + + rhashtable_destroy(&vxlan->fdb_hash_tbl); } /* Start ageing timer and join group when device is brought up */ @@ -3329,7 +3314,6 @@ static void vxlan_offload_rx_ports(struct net_device *dev, bool push) static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned int h; eth_hw_addr_random(dev); ether_setup(dev); @@ -3362,8 +3346,6 @@ static void vxlan_setup(struct net_device *dev) vxlan->dev = dev; - for (h = 0; h < FDB_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&vxlan->fdb_head[h]); INIT_HLIST_HEAD(&vxlan->fdb_list); } @@ -4944,8 +4926,6 @@ static int __init vxlan_init_module(void) { int rc; - get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); - rc = register_pernet_subsys(&vxlan_net_ops); if (rc) goto out1; diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h index 3ca19e7167c9..d328aed9feef 100644 --- a/drivers/net/vxlan/vxlan_private.h +++ b/drivers/net/vxlan/vxlan_private.h @@ -31,7 +31,7 @@ struct vxlan_fdb_key { /* Forwarding table entry */ struct vxlan_fdb { - struct hlist_node hlist; /* linked list of entries */ + struct rhash_head rhnode; struct rcu_head rcu; unsigned long updated; /* jiffies */ unsigned long used; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 96a6c6f45c2e..e2f7ca045d3e 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -304,7 +304,7 @@ struct vxlan_dev { struct vxlan_vni_group __rcu *vnigrp; - struct hlist_head fdb_head[FDB_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; struct rhashtable mdb_tbl; struct hlist_head fdb_list;