From 8a0f62fdeb9ea66ad3d0e959c7c4addbabeac1be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:17 +0000 Subject: [PATCH 1/4] ipv4: remove fib_devindex_hashfn() fib_devindex_hashfn() converts a 32bit ifindex value to a 8bit hash. It makes no sense doing this from fib_info_hashfn() and fib_find_info_nh(). It is better to keep as many bits as possible to let fib_info_hashfn_result() have better spread. Only fib_info_devhash_bucket() needs to make this operation, we can 'inline' trivial fib_devindex_hashfn() in it. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1a847ba40458..1219d1b32591 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -322,17 +322,12 @@ static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) return 0; } -static inline unsigned int fib_devindex_hashfn(unsigned int val) -{ - return hash_32(val, DEVINDEX_HASHBITS); -} - static struct hlist_head * fib_info_devhash_bucket(const struct net_device *dev) { u32 val = net_hash_mix(dev_net(dev)) ^ dev->ifindex; - return &fib_info_devhash[fib_devindex_hashfn(val)]; + return &fib_info_devhash[hash_32(val, DEVINDEX_HASHBITS)]; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, @@ -362,10 +357,10 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi) fi->fib_priority); if (fi->nh) { - val ^= fib_devindex_hashfn(fi->nh->id); + val ^= fi->nh->id; } else { for_nexthops(fi) { - val ^= fib_devindex_hashfn(nh->fib_nh_oif); + val ^= nh->fib_nh_oif; } endfor_nexthops(fi) } @@ -380,7 +375,7 @@ static struct fib_info *fib_find_info_nh(struct net *net, struct fib_info *fi; unsigned int hash; - hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id), + hash = fib_info_hashfn_1(cfg->fc_nh_id, cfg->fc_protocol, cfg->fc_scope, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); From fc38b28365e5f1396209d2878a34065468765087 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:18 +0000 Subject: [PATCH 2/4] ipv4: use rcu in ip_fib_check_default() fib_info_devhash[] is not resized in fib_info_hash_move(). fib_nh structs are already freed after an rcu grace period. This will allow to remove fib_info_lock in the following patch. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1219d1b32591..e0ffb4ffd95d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -275,7 +275,7 @@ void fib_release_info(struct fib_info *fi) change_nexthops(fi) { if (!nexthop_nh->fib_nh_dev) continue; - hlist_del(&nexthop_nh->nh_hash); + hlist_del_rcu(&nexthop_nh->nh_hash); } endfor_nexthops(fi) } /* Paired with READ_ONCE() from fib_table_lookup() */ @@ -431,28 +431,23 @@ static struct fib_info *fib_find_info(struct fib_info *nfi) } /* Check, that the gateway is already configured. - * Used only by redirect accept routine. + * Used only by redirect accept routine, under rcu_read_lock(); */ int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; struct fib_nh *nh; - spin_lock(&fib_info_lock); - head = fib_info_devhash_bucket(dev); - hlist_for_each_entry(nh, head, nh_hash) { + hlist_for_each_entry_rcu(nh, head, nh_hash) { if (nh->fib_nh_dev == dev && nh->fib_nh_gw4 == gw && !(nh->fib_nh_flags & RTNH_F_DEAD)) { - spin_unlock(&fib_info_lock); return 0; } } - spin_unlock(&fib_info_lock); - return -1; } @@ -1606,7 +1601,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, if (!nexthop_nh->fib_nh_dev) continue; head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev); - hlist_add_head(&nexthop_nh->nh_hash, head); + hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } spin_unlock_bh(&fib_info_lock); From 143ca845ec0c625c410768c36e1a949ef4ed1915 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:19 +0000 Subject: [PATCH 3/4] ipv4: remove fib_info_lock After the prior patch, fib_info_lock became redundant because all of its users are holding RTNL. BH protection is not needed. Remove the READ_ONCE()/WRITE_ONCE() annotations around fib_info_cnt, since it is protected by RTNL. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e0ffb4ffd95d..ece779bfb8f6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -50,7 +50,6 @@ #include "fib_lookup.h" -static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; static unsigned int fib_info_hash_size; @@ -260,12 +259,11 @@ EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { - spin_lock_bh(&fib_info_lock); + ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); - /* Paired with READ_ONCE() in fib_create_info(). */ - WRITE_ONCE(fib_info_cnt, fib_info_cnt - 1); + fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -282,7 +280,6 @@ void fib_release_info(struct fib_info *fi) WRITE_ONCE(fi->fib_dead, 1); fib_info_put(fi); } - spin_unlock_bh(&fib_info_lock); } static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) @@ -1266,7 +1263,7 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, unsigned int old_size = fib_info_hash_size; unsigned int i; - spin_lock_bh(&fib_info_lock); + ASSERT_RTNL(); old_info_hash = fib_info_hash; old_laddrhash = fib_info_laddrhash; fib_info_hash_size = new_size; @@ -1303,8 +1300,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, } } - spin_unlock_bh(&fib_info_lock); - kvfree(old_info_hash); kvfree(old_laddrhash); } @@ -1380,6 +1375,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; + ASSERT_RTNL(); if (cfg->fc_type > RTN_MAX) goto err_inval; @@ -1422,8 +1418,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, err = -ENOBUFS; - /* Paired with WRITE_ONCE() in fib_release_info() */ - if (READ_ONCE(fib_info_cnt) >= fib_info_hash_size) { + if (fib_info_cnt >= fib_info_hash_size) { unsigned int new_size = fib_info_hash_size << 1; struct hlist_head *new_info_hash; struct hlist_head *new_laddrhash; @@ -1582,7 +1577,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); - spin_lock_bh(&fib_info_lock); + fib_info_cnt++; hlist_add_head(&fi->fib_hash, &fib_info_hash[fib_info_hashfn(fi)]); @@ -1604,7 +1599,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } - spin_unlock_bh(&fib_info_lock); return fi; err_inval: From a3f5f4c2f9b6bc2aa6f5a3e8e23b7519e4f2e3e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:20 +0000 Subject: [PATCH 4/4] ipv4: remove fib_info_devhash[] Upcoming per-netns RTNL conversion needs to get rid of shared hash tables. fib_info_devhash[] is one of them. It is unclear why we used a hash table, because a single hlist_head per net device was cheaper and scalable. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 3 ++ net/ipv4/fib_semantics.c | 35 ++++++++----------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 49f03cb78c6e..556711c4d3cf 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -83,6 +83,7 @@ unsigned_int allmulti bool uc_promisc unsigned_char nested_level struct_in_device* ip_ptr read_mostly read_mostly __in_dev_get +struct hlist_head fib_nh_head struct_inet6_dev* ip6_ptr read_mostly read_mostly __in6_dev_get struct_vlan_info* vlan_info struct_dsa_port* dsa_ptr diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 49a7e7db0883..3baf8e539b6f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2211,6 +2211,9 @@ struct net_device { /* Protocol-specific pointers */ struct in_device __rcu *ip_ptr; + /** @fib_nh_head: nexthops associated with this netdev */ + struct hlist_head fib_nh_head; + #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index ece779bfb8f6..d2cee5c314f5 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -56,10 +56,6 @@ static unsigned int fib_info_hash_size; static unsigned int fib_info_hash_bits; static unsigned int fib_info_cnt; -#define DEVINDEX_HASHBITS 8 -#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) -static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; - /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ @@ -319,12 +315,9 @@ static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) return 0; } -static struct hlist_head * -fib_info_devhash_bucket(const struct net_device *dev) +static struct hlist_head *fib_nh_head(struct net_device *dev) { - u32 val = net_hash_mix(dev_net(dev)) ^ dev->ifindex; - - return &fib_info_devhash[hash_32(val, DEVINDEX_HASHBITS)]; + return &dev->fib_nh_head; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, @@ -435,11 +428,11 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) struct hlist_head *head; struct fib_nh *nh; - head = fib_info_devhash_bucket(dev); + head = fib_nh_head(dev); hlist_for_each_entry_rcu(nh, head, nh_hash) { - if (nh->fib_nh_dev == dev && - nh->fib_nh_gw4 == gw && + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + if (nh->fib_nh_gw4 == gw && !(nh->fib_nh_flags & RTNH_F_DEAD)) { return 0; } @@ -1595,7 +1588,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, if (!nexthop_nh->fib_nh_dev) continue; - head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev); + head = fib_nh_head(nexthop_nh->fib_nh_dev); hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } @@ -1948,12 +1941,12 @@ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) { - struct hlist_head *head = fib_info_devhash_bucket(dev); + struct hlist_head *head = fib_nh_head(dev); struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { - if (nh->fib_nh_dev == dev) - fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } @@ -1967,7 +1960,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { - struct hlist_head *head = fib_info_devhash_bucket(dev); + struct hlist_head *head = fib_nh_head(dev); struct fib_info *prev_fi = NULL; int scope = RT_SCOPE_NOWHERE; struct fib_nh *nh; @@ -1981,7 +1974,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) int dead; BUG_ON(!fi->fib_nhs); - if (nh->fib_nh_dev != dev || fi == prev_fi) + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + if (fi == prev_fi) continue; prev_fi = fi; dead = 0; @@ -2131,7 +2125,7 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) } prev_fi = NULL; - head = fib_info_devhash_bucket(dev); + head = fib_nh_head(dev); ret = 0; hlist_for_each_entry(nh, head, nh_hash) { @@ -2139,7 +2133,8 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) int alive; BUG_ON(!fi->fib_nhs); - if (nh->fib_nh_dev != dev || fi == prev_fi) + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + if (fi == prev_fi) continue; prev_fi = fi;