From caf0a753a8eb7ca2b035e199b71a3dabb853a18a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:06 +0000 Subject: [PATCH 01/15] neighbour: Make neigh_valid_get_req() return ndmsg. neigh_get() passes 4 local variable pointers to neigh_valid_get_req(). If it returns a pointer of struct ndmsg, we do not need to pass two of them. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d1de7f292eea..e888827c4b7d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2910,10 +2910,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) return err; } -static int neigh_valid_get_req(const struct nlmsghdr *nlh, - struct neigh_table **tbl, - void **dst, int *dev_idx, u8 *ndm_flags, - struct netlink_ext_ack *extack) +static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, + struct neigh_table **tbl, void **dst, + struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; @@ -2922,31 +2921,29 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) - return err; + return ERR_PTR(err); - *ndm_flags = ndm->ndm_flags; - *dev_idx = ndm->ndm_ifindex; *tbl = neigh_find_table(ndm->ndm_family); - if (*tbl == NULL) { + if (!*tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); - return -EAFNOSUPPORT; + return ERR_PTR(-EAFNOSUPPORT); } for (i = 0; i <= NDA_MAX; ++i) { @@ -2957,17 +2954,17 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, case NDA_DST: if (nla_len(tb[i]) != (int)(*tbl)->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } *dst = nla_data(tb[i]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } } - return 0; + return ndm; } static inline size_t neigh_nlmsg_size(void) @@ -3038,18 +3035,16 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net_device *dev = NULL; struct neigh_table *tbl = NULL; struct neighbour *neigh; + struct ndmsg *ndm; void *dst = NULL; - u8 ndm_flags = 0; - int dev_idx = 0; int err; - err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, - extack); - if (err < 0) - return err; + ndm = neigh_valid_get_req(nlh, &tbl, &dst, extack); + if (IS_ERR(ndm)) + return PTR_ERR(ndm); - if (dev_idx) { - dev = __dev_get_by_index(net, dev_idx); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; @@ -3061,7 +3056,7 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return -EINVAL; } - if (ndm_flags & NTF_PROXY) { + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; pn = pneigh_lookup(tbl, net, dst, dev, 0); From f5046fbc1b6d8c5168d47a617f368f9d4a025e34 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:07 +0000 Subject: [PATCH 02/15] neighbour: Move two validations from neigh_get() to neigh_valid_get_req(). We will remove RTNL for neigh_get() and run it under RCU instead. neigh_get() returns -EINVAL in the following cases: * NDA_DST is not specified * Both ndm->ndm_ifindex and NTF_PROXY are not specified These validations do not require RCU. Let's move them to neigh_valid_get_req(). While at it, the extack string for the first case is replaced with NL_SET_ERR_ATTR_MISS(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e888827c4b7d..7b63f47cd61a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2935,6 +2935,11 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, return ERR_PTR(-EINVAL); } + if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { + NL_SET_ERR_MSG(extack, "No device specified"); + return ERR_PTR(-EINVAL); + } + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) @@ -2947,11 +2952,13 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, } for (i = 0; i <= NDA_MAX; ++i) { - if (!tb[i]) - continue; - switch (i) { case NDA_DST: + if (!tb[i]) { + NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); + return ERR_PTR(-EINVAL); + } + if (nla_len(tb[i]) != (int)(*tbl)->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); return ERR_PTR(-EINVAL); @@ -2959,6 +2966,9 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, *dst = nla_data(tb[i]); break; default: + if (!tb[i]) + continue; + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); return ERR_PTR(-EINVAL); } @@ -3051,11 +3061,6 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, } } - if (!dst) { - NL_SET_ERR_MSG(extack, "Network address not specified"); - return -EINVAL; - } - if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -3068,11 +3073,6 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, nlh->nlmsg_seq, tbl); } - if (!dev) { - NL_SET_ERR_MSG(extack, "No device specified"); - return -EINVAL; - } - neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); From 3dfe0b57dcda070d9f1ed2bfb3a9bf0ec8632e08 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:08 +0000 Subject: [PATCH 03/15] neighbour: Allocate skb in neigh_get(). We will remove RTNL for neigh_get() and run it under RCU instead. neigh_get_reply() and pneigh_get_reply() allocate skb with GFP_KERNEL. Let's move the allocation before __dev_get_by_index() in neigh_get(). Now, neigh_get_reply() and pneigh_get_reply() are inlined and rtnl_unicast() is factorised. We will convert pneigh_lookup() to __pneigh_lookup() later. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-4-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 90 ++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 57 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7b63f47cd61a..761593ea8c91 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2988,27 +2988,6 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int neigh_get_reply(struct net *net, struct neighbour *neigh, - u32 pid, u32 seq) -{ - struct sk_buff *skb; - int err = 0; - - skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); - if (err) { - kfree_skb(skb); - goto errout; - } - - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} - static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) @@ -3017,34 +2996,16 @@ static inline size_t pneigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, - u32 pid, u32 seq, struct neigh_table *tbl) -{ - struct sk_buff *skb; - int err = 0; - - skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); - if (err) { - kfree_skb(skb); - goto errout; - } - - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} - static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); + u32 pid = NETLINK_CB(in_skb).portid; struct net_device *dev = NULL; struct neigh_table *tbl = NULL; + u32 seq = nlh->nlmsg_seq; struct neighbour *neigh; + struct sk_buff *skb; struct ndmsg *ndm; void *dst = NULL; int err; @@ -3053,11 +3014,19 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (IS_ERR(ndm)) return PTR_ERR(ndm); + if (ndm->ndm_flags & NTF_PROXY) + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + else + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); - return -ENODEV; + err = -ENODEV; + goto err_free_skb; } } @@ -3067,23 +3036,30 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, pn = pneigh_lookup(tbl, net, dst, dev, 0); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); - return -ENOENT; + err = -ENOENT; + goto err_free_skb; } - return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, tbl); + + err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) + goto err_free_skb; + } else { + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + err = -ENOENT; + goto err_free_skb; + } + + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + neigh_release(neigh); + if (err) + goto err_free_skb; } - neigh = neigh_lookup(tbl, dst, dev); - if (!neigh) { - NL_SET_ERR_MSG(extack, "Neighbour entry not found"); - return -ENOENT; - } - - err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq); - - neigh_release(neigh); - + return rtnl_unicast(skb, net, pid); +err_free_skb: + kfree_skb(skb); return err; } From 0e5ac19c78654abbf43dc4ffdae290c8cb81c59c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:09 +0000 Subject: [PATCH 04/15] neighbour: Move neigh_find_table() to neigh_get(). neigh_valid_get_req() calls neigh_find_table() to fetch neigh_tables[]. neigh_find_table() uses rcu_dereference_rtnl(), but RTNL actually does not protect it at all; neigh_table_clear() can be called without RTNL and only waits for RCU readers by synchronize_rcu(). Fortunately, there is no bug because IPv4 is built-in, IPv6 cannot be unloaded, and DECNET was removed. To fetch neigh_tables[] by rcu_dereference() later, let's move neigh_find_table() from neigh_valid_get_req() to neigh_get(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 761593ea8c91..ffb8d80328ed 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2911,10 +2911,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) } static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, - struct neigh_table **tbl, void **dst, + struct nlattr **tb, struct netlink_ext_ack *extack) { - struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; @@ -2945,12 +2944,6 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, if (err < 0) return ERR_PTR(err); - *tbl = neigh_find_table(ndm->ndm_family); - if (!*tbl) { - NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); - return ERR_PTR(-EAFNOSUPPORT); - } - for (i = 0; i <= NDA_MAX; ++i) { switch (i) { case NDA_DST: @@ -2958,12 +2951,6 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); return ERR_PTR(-EINVAL); } - - if (nla_len(tb[i]) != (int)(*tbl)->key_len) { - NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); - return ERR_PTR(-EINVAL); - } - *dst = nla_data(tb[i]); break; default: if (!tb[i]) @@ -3001,16 +2988,17 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, { struct net *net = sock_net(in_skb->sk); u32 pid = NETLINK_CB(in_skb).portid; + struct nlattr *tb[NDA_MAX + 1]; struct net_device *dev = NULL; - struct neigh_table *tbl = NULL; u32 seq = nlh->nlmsg_seq; + struct neigh_table *tbl; struct neighbour *neigh; struct sk_buff *skb; struct ndmsg *ndm; - void *dst = NULL; + void *dst; int err; - ndm = neigh_valid_get_req(nlh, &tbl, &dst, extack); + ndm = neigh_valid_get_req(nlh, tb, extack); if (IS_ERR(ndm)) return PTR_ERR(ndm); @@ -3021,6 +3009,21 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; + tbl = neigh_find_table(ndm->ndm_family); + if (!tbl) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + err = -EAFNOSUPPORT; + goto err_free_skb; + } + + if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + err = -EINVAL; + goto err_free_skb; + } + + dst = nla_data(tb[NDA_DST]); + if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) { From e804bd83c1fd7e1f03899c948812ebc207ac5a7e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:10 +0000 Subject: [PATCH 05/15] neighbour: Split pneigh_lookup(). pneigh_lookup() has ASSERT_RTNL() in the middle of the function, which is confusing. When called with the last argument, creat, 0, pneigh_lookup() literally looks up a proxy neighbour entry. This is the case of the reader path as the fast path and RTM_GETNEIGH. pneigh_lookup(), however, creates a pneigh_entry when called with creat 1 from RTM_NEWNEIGH and SIOCSARP, which require RTNL. Let's split pneigh_lookup() into two functions. We will convert all the reader paths to RCU, and read_lock_bh(&tbl->lock) in the new pneigh_lookup() will be dropped. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 5 +++-- net/core/neighbour.c | 41 ++++++++++++++++++++++++++++++----------- net/ipv4/arp.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- net/ipv6/ndisc.c | 2 +- 5 files changed, 37 insertions(+), 17 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7e865b14749d..7f3d57da5689 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -376,10 +376,11 @@ unsigned long neigh_rand_reach_time(unsigned long base); void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev, - int creat); + const void *key, struct net_device *dev); struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); +struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, + const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ffb8d80328ed..d0e303360b2c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -746,24 +747,44 @@ struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, } EXPORT_SYMBOL_GPL(__pneigh_lookup); -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev, int creat) +struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) { struct pneigh_entry *n; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); - if (n || !creat) - goto out; + return n; +} +EXPORT_IPV6_MOD(pneigh_lookup); + +struct pneigh_entry *pneigh_create(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n; + unsigned int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); ASSERT_RTNL(); + read_lock_bh(&tbl->lock); + n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); + read_unlock_bh(&tbl->lock); + + if (n) + goto out; + n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; @@ -787,8 +808,6 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, out: return n; } -EXPORT_SYMBOL(pneigh_lookup); - int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) @@ -2007,7 +2026,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); + pn = pneigh_create(tbl, net, dst, dev); if (pn) { pn->flags = ndm_flags; pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); @@ -3036,7 +3055,7 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; - pn = pneigh_lookup(tbl, net, dst, dev, 0); + pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c0440d61cf2f..d93b5735b0ba 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -864,7 +864,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && - pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { + pneigh_lookup(&arp_tbl, net, &tip, dev)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -1089,7 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) + if (!pneigh_create(&arp_tbl, net, &ip, dev)) return -ENOBUFS; return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fcc20c7250eb..0412f8544695 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -563,7 +563,7 @@ int ip6_forward(struct sk_buff *skb) /* XXX: idev->cnf.proxy_ndp? */ if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d4c5876e1771..a3ac26c1df6d 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1100,7 +1100,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && READ_ONCE(net->ipv6.devconf_all->forwarding) && READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { + pneigh_lookup(&nd_tbl, net, &msg->target, dev)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } From d63382aea70aa4ecb516126e00930bc8ab5e55ef Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:11 +0000 Subject: [PATCH 06/15] neighbour: Annotate neigh_table.phash_buckets and pneigh_entry.next with __rcu. The next patch will free pneigh_entry with call_rcu(). Then, we need to annotate neigh_table.phash_buckets[] and pneigh_entry.next with __rcu. To make the next patch cleaner, let's annotate the fields in advance. Currently, all accesses to the fields are under the neigh table lock, so rcu_dereference_protected() is used with 1 for now, but most of them (except in pneigh_delete() and pneigh_ifdown_and_unlock()) will be replaced with rcu_dereference() and rcu_dereference_check(). Note that pneigh_ifdown_and_unlock() changes pneigh_entry.next to a local list, which is illegal because the RCU iterator could be moved to another list. This part will be fixed in the next patch. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 4 ++-- net/core/neighbour.c | 52 ++++++++++++++++++++++++----------------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7f3d57da5689..1ddc44a04200 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -176,7 +176,7 @@ struct neigh_ops { }; struct pneigh_entry { - struct pneigh_entry *next; + struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; @@ -236,7 +236,7 @@ struct neigh_table { unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; - struct pneigh_entry **phash_buckets; + struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d0e303360b2c..7fcb0a8d655f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -731,7 +731,8 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; - n = n->next; + + n = rcu_dereference_protected(n->next, 1); } return NULL; } @@ -742,7 +743,7 @@ struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); - return __pneigh_lookup_1(tbl->phash_buckets[hash_val], + return __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), net, pkey, key_len, dev); } EXPORT_SYMBOL_GPL(__pneigh_lookup); @@ -759,7 +760,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); @@ -778,7 +779,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, ASSERT_RTNL(); read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); @@ -803,7 +804,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; - tbl->phash_buckets[hash_val] = n; + rcu_assign_pointer(tbl->phash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); out: return n; @@ -812,16 +813,20 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - struct pneigh_entry *n, **np; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + struct pneigh_entry *n, __rcu **np; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); - for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + for (np = &tbl->phash_buckets[hash_val]; + (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { - *np = n->next; + rcu_assign_pointer(*np, n->next); write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); @@ -838,17 +843,17 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - struct pneigh_entry *n, **np, *freelist = NULL; + struct pneigh_entry *n, __rcu **np, *freelist = NULL; u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, 1)) != NULL) { if (skip_perm && n->permanent) goto skip; if (!dev || n->dev == dev) { - *np = n->next; - n->next = freelist; + rcu_assign_pointer(*np, n->next); + rcu_assign_pointer(n->next, freelist); freelist = n; continue; } @@ -858,7 +863,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, } write_unlock_bh(&tbl->lock); while ((n = freelist)) { - freelist = n->next; + freelist = rcu_dereference_protected(n->next, 1); n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); @@ -2794,7 +2799,9 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference_protected(tbl->phash_buckets[h], 1), idx = 0; + n; + n = rcu_dereference_protected(n->next, 1)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -3288,9 +3295,10 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = tbl->phash_buckets[bucket]; + pn = rcu_dereference_protected(tbl->phash_buckets[bucket], 1); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference_protected(pn->next, 1); if (pn) break; } @@ -3308,15 +3316,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = pn->next; + pn = rcu_dereference_protected(pn->next, 1); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = tbl->phash_buckets[state->bucket]; + + pn = rcu_dereference_protected(tbl->phash_buckets[state->bucket], 1); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference_protected(pn->next, 1); if (pn) break; } From d539d8fbd8fcf64a1492c51f5ee99aaa8a8dc9ab Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:12 +0000 Subject: [PATCH 07/15] neighbour: Free pneigh_entry after RCU grace period. We will convert RTM_GETNEIGH to RCU. neigh_get() looks up pneigh_entry by pneigh_lookup() and passes it to pneigh_fill_info(). Then, we must ensure that the entry is alive till pneigh_fill_info() completes, but read_lock_bh(&tbl->lock) in pneigh_lookup() does not guarantee that. Also, we will convert all readers of tbl->phash_buckets[] to RCU. Let's use call_rcu() to free pneigh_entry and update phash_buckets[] and ->next by rcu_assign_pointer(). pneigh_ifdown_and_unlock() uses list_head to avoid overwriting ->next and moving RCU iterators to another list. pndisc_destructor() (only IPv6 ndisc uses this) uses a mutex, so it is not delayed to call_rcu(), where we cannot sleep. This is fine because the mcast code works with RCU and ipv6_dev_mc_dec() frees mcast objects after RCU grace period. While at it, we change the return type of pneigh_ifdown_and_unlock() to void. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-8-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 4 ++++ net/core/neighbour.c | 45 +++++++++++++++++++++++++---------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 1ddc44a04200..6d7f9aa53a7a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -180,6 +180,10 @@ struct pneigh_entry { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; + union { + struct list_head free_node; + struct rcu_head rcu; + }; u32 flags; u8 protocol; bool permanent; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7fcb0a8d655f..fa2e60a479ef 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -54,9 +54,9 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm); +static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -810,6 +810,14 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, return n; } +static void pneigh_destroy(struct rcu_head *rcu) +{ + struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); + + netdev_put(n->dev, &n->dev_tracker); + kfree(n); +} + int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { @@ -828,10 +836,11 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); return 0; } } @@ -839,11 +848,12 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, return -ENOENT; } -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm) +static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev, + bool skip_perm) { - struct pneigh_entry *n, __rcu **np, *freelist = NULL; + struct pneigh_entry *n, __rcu **np; + LIST_HEAD(head); u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { @@ -853,24 +863,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, goto skip; if (!dev || n->dev == dev) { rcu_assign_pointer(*np, n->next); - rcu_assign_pointer(n->next, freelist); - freelist = n; + list_add(&n->free_node, &head); continue; } skip: np = &n->next; } } + write_unlock_bh(&tbl->lock); - while ((n = freelist)) { - freelist = rcu_dereference_protected(n->next, 1); - n->next = NULL; + + while (!list_empty(&head)) { + n = list_first_entry(&head, typeof(*n), free_node); + list_del(&n->free_node); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); } - return -ENOENT; } static inline void neigh_parms_put(struct neigh_parms *parms) From cc03492c7b92cb4414bd72a88f4d88ceb78362f9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:13 +0000 Subject: [PATCH 08/15] neighbour: Annotate access to struct pneigh_entry.{flags,protocol}. We will convert pneigh readers to RCU, and its flags and protocol will be read locklessly. Let's annotate the access to the two fields. Note that all access to pn->permanent is under RTNL (neigh_add() and pneigh_ifdown_and_unlock()), so WRITE_ONCE() and READ_ONCE() are not needed. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-9-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index fa2e60a479ef..59dfdecddff3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2044,10 +2044,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = -ENOBUFS; pn = pneigh_create(tbl, net, dst, dev); if (pn) { - pn->flags = ndm_flags; + WRITE_ONCE(pn->flags, ndm_flags); pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); if (protocol) - pn->protocol = protocol; + WRITE_ONCE(pn->protocol, protocol); err = 0; } goto out; @@ -2678,13 +2678,15 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; + u8 protocol; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; - neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; - neigh_flags = pn->flags & NTF_OLD_MASK; + neigh_flags = READ_ONCE(pn->flags); + neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; + neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; @@ -2698,7 +2700,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; - if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + protocol = READ_ONCE(pn->protocol); + if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; From ed6e380d2d419a3e8ca73de7b4c7ccb522835f1e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:14 +0000 Subject: [PATCH 09/15] neighbour: Convert RTM_GETNEIGH to RCU. Only __dev_get_by_index() is the RTNL dependant in neigh_get(). Let's replace it with dev_get_by_index_rcu() and convert RTM_GETNEIGH to RCU. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-10-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 59dfdecddff3..50ab95ecbcd6 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3049,27 +3049,29 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; + rcu_read_lock(); + tbl = neigh_find_table(ndm->ndm_family); if (!tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); err = -EAFNOSUPPORT; - goto err_free_skb; + goto err_unlock; } if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); err = -EINVAL; - goto err_free_skb; + goto err_unlock; } dst = nla_data(tb[NDA_DST]); if (ndm->ndm_ifindex) { - dev = __dev_get_by_index(net, ndm->ndm_ifindex); + dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); err = -ENODEV; - goto err_free_skb; + goto err_unlock; } } @@ -3080,28 +3082,31 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; - goto err_free_skb; + goto err_unlock; } err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); if (err) - goto err_free_skb; + goto err_unlock; } else { neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); err = -ENOENT; - goto err_free_skb; + goto err_unlock; } err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); neigh_release(neigh); if (err) - goto err_free_skb; + goto err_unlock; } + rcu_read_unlock(); + return rtnl_unicast(skb, net, pid); -err_free_skb: +err_unlock: + rcu_read_unlock(); kfree_skb(skb); return err; } @@ -3904,7 +3909,7 @@ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, - .flags = RTNL_FLAG_DUMP_UNLOCKED}, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, }; From 32d5eaabf186112eb2023aacb860c47ff7e7fdbf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:15 +0000 Subject: [PATCH 10/15] neighbour: Drop read_lock_bh(&tbl->lock) in pneigh_dump_table(). Now pneigh_entry is guaranteed to be alive during the RCU critical section even without holding tbl->lock. Let's drop read_lock_bh(&tbl->lock) and use rcu_dereference() to iterate tbl->phash_buckets[] in pneigh_dump_table() Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-11-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 50ab95ecbcd6..3ef797212618 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2808,14 +2808,12 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - read_lock_bh(&tbl->lock); - for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = rcu_dereference_protected(tbl->phash_buckets[h], 1), idx = 0; + for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; n; - n = rcu_dereference_protected(n->next, 1)) { + n = rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -2824,16 +2822,13 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); - if (err < 0) { - read_unlock_bh(&tbl->lock); + if (err < 0) goto out; - } next: idx++; } } - read_unlock_bh(&tbl->lock); out: cb->args[3] = h; cb->args[4] = idx; From b9c89fa128fa41e56300434dfca1138459b29384 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:16 +0000 Subject: [PATCH 11/15] neighbour: Use rcu_dereference() in pneigh_get_{first,next}(). Now pneigh_entry is guaranteed to be alive during the RCU critical section even without holding tbl->lock. Let's use rcu_dereference() in pneigh_get_{first,next}(). Note that neigh_seq_start() still holds tbl->lock for the normal neighbour entry. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-12-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3ef797212618..b76ff416b9a7 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3309,10 +3309,10 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = rcu_dereference_protected(tbl->phash_buckets[bucket], 1); + pn = rcu_dereference(tbl->phash_buckets[bucket]); while (pn && !net_eq(pneigh_net(pn), net)) - pn = rcu_dereference_protected(pn->next, 1); + pn = rcu_dereference(pn->next); if (pn) break; } @@ -3330,17 +3330,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = rcu_dereference_protected(pn->next, 1); + pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = rcu_dereference_protected(tbl->phash_buckets[state->bucket], 1); + pn = rcu_dereference(tbl->phash_buckets[state->bucket]); while (pn && !net_eq(pneigh_net(pn), net)) - pn = rcu_dereference_protected(pn->next, 1); + pn = rcu_dereference(pn->next); if (pn) break; } From dd103c9a53752d3754a3182ec8dd97885680cfe2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:17 +0000 Subject: [PATCH 12/15] neighbour: Remove __pneigh_lookup(). __pneigh_lookup() is the lockless version of pneigh_lookup(), but its only caller pndisc_is_router() holds the table lock and reads pneigh_netry.flags. This is because accessing pneigh_entry after pneigh_lookup() was illegal unless the caller holds RTNL or the table lock. Now, pneigh_entry is guaranteed to be alive during the RCU critical section. Let's call pneigh_lookup() and use READ_ONCE() for n->flags in pndisc_is_router() and remove __pneigh_lookup(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-13-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 2 -- net/core/neighbour.c | 11 ----------- net/ipv6/ndisc.c | 6 ++---- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 6d7f9aa53a7a..f8c7261cd4eb 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -381,8 +381,6 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b76ff416b9a7..e7bd8111f97f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -737,17 +737,6 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, return NULL; } -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, struct net_device *dev) -{ - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); - - return __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), - net, pkey, key_len, dev); -} -EXPORT_SYMBOL_GPL(__pneigh_lookup); - struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a3ac26c1df6d..7d5abb3158ec 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -768,11 +768,9 @@ static int pndisc_is_router(const void *pkey, struct pneigh_entry *n; int ret = -1; - read_lock_bh(&nd_tbl.lock); - n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); + n = pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) - ret = !!(n->flags & NTF_ROUTER); - read_unlock_bh(&nd_tbl.lock); + ret = !!(READ_ONCE(n->flags) & NTF_ROUTER); return ret; } From b8b7ed1ea83a9b2b6e697c10c4b24b9ea0003e19 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:18 +0000 Subject: [PATCH 13/15] neighbour: Drop read_lock_bh(&tbl->lock) in pneigh_lookup(). Now, all callers of pneigh_lookup() are under RCU, and the read lock there is no longer needed. Let's drop the lock, inline __pneigh_lookup_1() to pneigh_lookup(), and call it from pneigh_create(). The next patch will remove tbl->lock from pneigh_create(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-14-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e7bd8111f97f..38f0067068c5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -720,23 +720,6 @@ static u32 pneigh_hash(const void *pkey, unsigned int key_len) return hash_val; } -static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, - struct net *net, - const void *pkey, - unsigned int key_len, - struct net_device *dev) -{ - while (n) { - if (!memcmp(n->key, pkey, key_len) && - net_eq(pneigh_net(n), net) && - (n->dev == dev || !n->dev)) - return n; - - n = rcu_dereference_protected(n->next, 1); - } - return NULL; -} - struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) @@ -747,13 +730,19 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(tbl->phash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); - read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), - net, pkey, key_len, dev); - read_unlock_bh(&tbl->lock); + while (n) { + if (!memcmp(n->key, pkey, key_len) && + net_eq(pneigh_net(n), net) && + (n->dev == dev || !n->dev)) + return n; - return n; + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->lock)); + } + + return NULL; } EXPORT_IPV6_MOD(pneigh_lookup); @@ -762,19 +751,18 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net_device *dev) { struct pneigh_entry *n; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + unsigned int key_len; + u32 hash_val; ASSERT_RTNL(); read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), - net, pkey, key_len, dev); + n = pneigh_lookup(tbl, net, pkey, dev); read_unlock_bh(&tbl->lock); - if (n) goto out; + key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; @@ -791,6 +779,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, goto out; } + hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); From 13a936bb99fb6385dc8620d24d7111e514448371 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:19 +0000 Subject: [PATCH 14/15] neighbour: Protect tbl->phash_buckets[] with a dedicated mutex. tbl->phash_buckets[] is only modified in the slow path by pneigh_create() and pneigh_delete() under the table lock. Both of them are called under RTNL, so no extra lock is needed, but we will remove RTNL from the paths. pneigh_create() looks up a pneigh_entry, and this part can be lockless, but it would complicate the logic like 1. lookup 2. allocate pengih_entry for GFP_KERNEL 3. lookup again but under lock 4. if found, return it after freeing the allocated memory 5. else, return the new one Instead, let's add a per-table mutex and run lookup and allocation under it. Note that updating pneigh_entry part in neigh_add() is still protected by RTNL and will be moved to pneigh_create() in the next patch. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-15-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 1 + net/core/neighbour.c | 39 +++++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f8c7261cd4eb..f333f9ebc425 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -240,6 +240,7 @@ struct neigh_table { unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; + struct mutex phash_lock; struct pneigh_entry __rcu **phash_buckets; }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 38f0067068c5..d312b6323ff2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -54,9 +54,8 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm); +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -437,7 +436,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); - pneigh_ifdown_and_unlock(tbl, dev, skip_perm); + write_unlock_bh(&tbl->lock); + + pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) @@ -731,7 +732,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); n = rcu_dereference_check(tbl->phash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); + lockdep_is_held(&tbl->phash_lock)); while (n) { if (!memcmp(n->key, pkey, key_len) && @@ -739,7 +740,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, (n->dev == dev || !n->dev)) return n; - n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->lock)); + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); } return NULL; @@ -754,11 +755,9 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, unsigned int key_len; u32 hash_val; - ASSERT_RTNL(); + mutex_lock(&tbl->phash_lock); - read_lock_bh(&tbl->lock); n = pneigh_lookup(tbl, net, pkey, dev); - read_unlock_bh(&tbl->lock); if (n) goto out; @@ -780,11 +779,10 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, } hash_val = pneigh_hash(pkey, key_len); - write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); - write_unlock_bh(&tbl->lock); out: + mutex_unlock(&tbl->phash_lock); return n; } @@ -806,14 +804,16 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); - write_lock_bh(&tbl->lock); + mutex_lock(&tbl->phash_lock); + for (np = &tbl->phash_buckets[hash_val]; (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); if (tbl->pdestructor) tbl->pdestructor(n); @@ -822,18 +822,20 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, return 0; } } - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); return -ENOENT; } -static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm) +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { struct pneigh_entry *n, __rcu **np; LIST_HEAD(head); u32 h; + mutex_lock(&tbl->phash_lock); + for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = rcu_dereference_protected(*np, 1)) != NULL) { @@ -849,7 +851,7 @@ static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, } } - write_unlock_bh(&tbl->lock); + mutex_unlock(&tbl->phash_lock); while (!list_empty(&head)) { n = list_first_entry(&head, typeof(*n), free_node); @@ -1796,6 +1798,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); + mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, From dc2a27e524ac13e7a599bc693934ed81f868dc2d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:20 +0000 Subject: [PATCH 15/15] neighbour: Update pneigh_entry in pneigh_create(). neigh_add() updates pneigh_entry() found or created by pneigh_create(). This update is serialised by RTNL, but we will remove it. Let's move the update part to pneigh_create() and make it return errno instead of a pointer of pneigh_entry. Now, the pneigh code is RTNL free. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-16-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 5 +++-- net/core/neighbour.c | 34 ++++++++++++++++------------------ net/ipv4/arp.c | 4 +--- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f333f9ebc425..4a30bd458c5a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -382,8 +382,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); -struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); +int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, + struct net_device *dev, u32 flags, u8 protocol, + bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d312b6323ff2..4316ca3d9872 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -747,24 +747,27 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, } EXPORT_IPV6_MOD(pneigh_lookup); -struct pneigh_entry *pneigh_create(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev) +int pneigh_create(struct neigh_table *tbl, struct net *net, + const void *pkey, struct net_device *dev, + u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; + int err = 0; mutex_lock(&tbl->phash_lock); n = pneigh_lookup(tbl, net, pkey, dev); if (n) - goto out; + goto update; key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); - if (!n) + if (!n) { + err = -ENOBUFS; goto out; + } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); @@ -774,16 +777,20 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); - n = NULL; + err = -ENOBUFS; goto out; } hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); +update: + WRITE_ONCE(n->flags, flags); + n->permanent = permanent; + WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); - return n; + return err; } static void pneigh_destroy(struct rcu_head *rcu) @@ -2015,22 +2022,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; - if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } - err = -ENOBUFS; - pn = pneigh_create(tbl, net, dst, dev); - if (pn) { - WRITE_ONCE(pn->flags, ndm_flags); - pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); - if (protocol) - WRITE_ONCE(pn->protocol, protocol); - err = 0; - } + err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, + !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index d93b5735b0ba..5cfc1c939673 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1089,9 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!pneigh_create(&arp_tbl, net, &ip, dev)) - return -ENOBUFS; - return 0; + return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false); } return arp_req_set_proxy(net, dev, 1);