From 05068eaa67b29963c1249c3032658968f64993e6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:19 +0000 Subject: [PATCH 01/15] selftest: net: Add basic functionality tests for ipmr. The new tests exercise paths, where RTNL is needed, to catch lockdep splats: setsockopt MRT_INIT / MRT_DONE MRT_ADD_VIF / MRT_DEL_VIF MRT_ADD_MFC / MRT_DEL_MFC / MRT_ADD_MFC_PROXY / MRT_DEL_MFC_PROXY MRT_TABLE MRT_FLUSH rtnetlink RTM_NEWROUTE RTM_DELROUTE NETDEV_UNREGISTER I will extend this to cover IPv6 setsockopt() later. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/.gitignore | 1 + .../testing/selftests/net/forwarding/Makefile | 4 + tools/testing/selftests/net/forwarding/ipmr.c | 455 ++++++++++++++++++ 3 files changed, 460 insertions(+) create mode 100644 tools/testing/selftests/net/forwarding/ipmr.c diff --git a/tools/testing/selftests/net/forwarding/.gitignore b/tools/testing/selftests/net/forwarding/.gitignore index 2dea317f12e7..418ff96c52ef 100644 --- a/tools/testing/selftests/net/forwarding/.gitignore +++ b/tools/testing/selftests/net/forwarding/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only forwarding.config +ipmr diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index ff4a00d91a26..bbaf4d937dd8 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -133,6 +133,10 @@ TEST_FILES := \ tc_common.sh \ # end of TEST_FILES +TEST_GEN_PROGS := \ + ipmr +# end of TEST_GEN_PROGS + TEST_INCLUDES := \ $(wildcard ../lib/sh/*.sh) \ ../lib.sh \ diff --git a/tools/testing/selftests/net/forwarding/ipmr.c b/tools/testing/selftests/net/forwarding/ipmr.c new file mode 100644 index 000000000000..df870aad9ead --- /dev/null +++ b/tools/testing/selftests/net/forwarding/ipmr.c @@ -0,0 
+1,455 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2026 Google LLC */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest_harness.h" + +FIXTURE(ipmr) +{ + int netlink_sk; + int raw_sk; + int veth_ifindex; +}; + +FIXTURE_VARIANT(ipmr) +{ + int family; + int protocol; + int level; + int opts[MRT_MAX - MRT_BASE + 1]; +}; + +FIXTURE_VARIANT_ADD(ipmr, ipv4) +{ + .family = AF_INET, + .protocol = IPPROTO_IGMP, + .level = IPPROTO_IP, + .opts = { + MRT_INIT, + MRT_DONE, + MRT_ADD_VIF, + MRT_DEL_VIF, + MRT_ADD_MFC, + MRT_DEL_MFC, + MRT_VERSION, + MRT_ASSERT, + MRT_PIM, + MRT_TABLE, + MRT_ADD_MFC_PROXY, + MRT_DEL_MFC_PROXY, + MRT_FLUSH, + }, +}; + +struct mfc_attr { + int table; + __u32 origin; + __u32 group; + int ifindex; + bool proxy; +}; + +static struct rtattr *nl_add_rtattr(struct nlmsghdr *nlmsg, struct rtattr *rta, + int type, const void *data, int len) +{ + int unused = 0; + + rta->rta_type = type; + rta->rta_len = RTA_LENGTH(len); + memcpy(RTA_DATA(rta), data, len); + + nlmsg->nlmsg_len += NLMSG_ALIGN(rta->rta_len); + + return RTA_NEXT(rta, unused); +} + +static int nl_sendmsg_mfc(struct __test_metadata *_metadata, FIXTURE_DATA(ipmr) *self, + __u16 nlmsg_type, struct mfc_attr *mfc_attr) +{ + struct { + struct nlmsghdr nlmsg; + struct rtmsg rtm; + char buf[4096]; + } req = { + .nlmsg = { + .nlmsg_len = NLMSG_LENGTH(sizeof(req.rtm)), + /* ipmr does not care about NLM_F_CREATE and NLM_F_EXCL ... 
*/ + .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, + .nlmsg_type = nlmsg_type, + }, + .rtm = { + /* hard requirements in rtm_to_ipmr_mfcc() */ + .rtm_family = RTNL_FAMILY_IPMR, + .rtm_dst_len = 32, + .rtm_type = RTN_MULTICAST, + .rtm_scope = RT_SCOPE_UNIVERSE, + .rtm_protocol = RTPROT_MROUTED, + }, + }; + struct nlmsghdr *nlmsg = &req.nlmsg; + struct nlmsgerr *errmsg; + struct rtattr *rta; + int err; + + rta = (struct rtattr *)&req.buf; + rta = nl_add_rtattr(nlmsg, rta, RTA_TABLE, &mfc_attr->table, sizeof(mfc_attr->table)); + rta = nl_add_rtattr(nlmsg, rta, RTA_SRC, &mfc_attr->origin, sizeof(mfc_attr->origin)); + rta = nl_add_rtattr(nlmsg, rta, RTA_DST, &mfc_attr->group, sizeof(mfc_attr->group)); + if (mfc_attr->ifindex) + rta = nl_add_rtattr(nlmsg, rta, RTA_IIF, &mfc_attr->ifindex, sizeof(mfc_attr->ifindex)); + if (mfc_attr->proxy) + rta = nl_add_rtattr(nlmsg, rta, RTA_PREFSRC, NULL, 0); + + err = send(self->netlink_sk, &req, req.nlmsg.nlmsg_len, 0); + ASSERT_EQ(err, req.nlmsg.nlmsg_len); + + memset(&req, 0, sizeof(req)); + + err = recv(self->netlink_sk, &req, sizeof(req), 0); + ASSERT_TRUE(NLMSG_OK(nlmsg, err)); + ASSERT_EQ(NLMSG_ERROR, nlmsg->nlmsg_type); + + errmsg = (struct nlmsgerr *)NLMSG_DATA(nlmsg); + return errmsg->error; +} + +FIXTURE_SETUP(ipmr) +{ + struct ifreq ifr = { + .ifr_name = "veth0", + }; + int err; + + err = unshare(CLONE_NEWNET); + ASSERT_EQ(0, err); + + self->netlink_sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + ASSERT_LE(0, self->netlink_sk); + + self->raw_sk = socket(variant->family, SOCK_RAW, variant->protocol); + ASSERT_LT(0, self->raw_sk); + + err = system("ip link add veth0 type veth peer veth1"); + ASSERT_EQ(0, err); + + err = ioctl(self->raw_sk, SIOCGIFINDEX, &ifr); + ASSERT_EQ(0, err); + + self->veth_ifindex = ifr.ifr_ifindex; +} + +FIXTURE_TEARDOWN(ipmr) +{ + close(self->raw_sk); + close(self->netlink_sk); +} + +TEST_F(ipmr, mrt_init) +{ + int err, val = 0; /* any value is ok, but size must be int for MRT_INIT. 
*/ + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_INIT - MRT_BASE], + &val, sizeof(val)); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DONE - MRT_BASE], + &val, sizeof(val)); + ASSERT_EQ(0, err); +} + +TEST_F(ipmr, mrt_add_vif_register) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_REGISTER, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_vif | grep -q pimreg"); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); +} + +TEST_F(ipmr, mrt_del_vif_unreg) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_vif | grep -q veth0"); + ASSERT_EQ(0, err); + + /* VIF is removed along with its device. */ + err = system("ip link del veth0"); + ASSERT_EQ(0, err); + + /* mrt->vif_table[veth_ifindex]->dev is NULL. */ + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(-1, err); + ASSERT_EQ(EADDRNOTAVAIL, errno); +} + +TEST_F(ipmr, mrt_del_vif_netns_dismantle) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + /* Let cleanup_net() remove veth0 and VIF. */ +} + +TEST_F(ipmr, mrt_add_mfc) +{ + struct mfcctl mfc = {}; + int err; + + /* MRT_ADD_MFC / MRT_ADD_MFC_PROXY does not need vif to exist (unlike netlink). 
*/ + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE], + &mfc, sizeof(mfc)); + ASSERT_EQ(0, err); + + /* (0.0.0.0 -> 0.0.0.0) */ + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DEL_MFC - MRT_BASE], + &mfc, sizeof(mfc)); +} + +TEST_F(ipmr, mrt_add_mfc_proxy) +{ + struct mfcctl mfc = {}; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_MFC_PROXY - MRT_BASE], + &mfc, sizeof(mfc)); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_DEL_MFC_PROXY - MRT_BASE], + &mfc, sizeof(mfc)); +} + +TEST_F(ipmr, mrt_add_mfc_netlink) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + struct mfc_attr mfc_attr = { + .table = RT_TABLE_DEFAULT, + .origin = 0, + .group = 0, + .ifindex = self->veth_ifindex, + .proxy = false, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); + ASSERT_EQ(0, err); +} + +TEST_F(ipmr, mrt_add_mfc_netlink_proxy) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + struct mfc_attr mfc_attr = { + .table = RT_TABLE_DEFAULT, + .origin = 0, + .group = 0, + .ifindex = self->veth_ifindex, + .proxy = true, + }; + int err; + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + 
ASSERT_EQ(0, err); + + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); + ASSERT_EQ(0, err); +} + +TEST_F(ipmr, mrt_add_mfc_netlink_no_vif) +{ + struct mfc_attr mfc_attr = { + .table = RT_TABLE_DEFAULT, + .origin = 0, + .group = 0, + .proxy = false, + }; + int err; + + /* netlink always requires RTA_IIF of an existing vif. */ + mfc_attr.ifindex = 0; + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(-ENFILE, err); + + /* netlink always requires RTA_IIF of an existing vif. */ + mfc_attr.ifindex = self->veth_ifindex; + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(-ENFILE, err); +} + +TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) +{ + struct vifctl vifs[2] = { + { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }, + { + .vifc_vifi = 1, + .vifc_flags = VIFF_REGISTER, + } + }; + struct mfc_attr mfc_attr = { + .table = RT_TABLE_DEFAULT, + .origin = 0, + .group = 0, + .ifindex = self->veth_ifindex, + .proxy = false, + }; + int i, err; + + for (i = 0; i < 2; i++) { + /* Create 2 VIFs just to avoid -ENFILE later. */ + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vifs[i], sizeof(vifs[i])); + ASSERT_EQ(0, err); + } + + /* Create a MFC for mrt->vif_table[0]. */ + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(0, err); + + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + /* Remove mrt->vif_table[0]. */ + err = system("ip link del veth0"); + ASSERT_EQ(0, err); + + /* MFC entry is NOT removed even if the tied VIF is removed... */ + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + ASSERT_EQ(0, err); + + /* ... 
and netlink is not capable of removing such an entry + * because netlink always requires a valid RTA_IIF ... :/ + */ + err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); + ASSERT_EQ(-ENODEV, err); + + /* It can be removed by setsockopt(), but let cleanup_net() remove this time. */ +} + +TEST_F(ipmr, mrt_table_flush) +{ + struct vifctl vif = { + .vifc_vifi = 0, + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + struct mfc_attr mfc_attr = { + .origin = 0, + .group = 0, + .ifindex = self->veth_ifindex, + .proxy = false, + }; + int table_id = 92; + int err, flags; + + /* Set a random table id rather than RT_TABLE_DEFAULT. + * Note that /proc/net/ip_mr_{vif,cache} only supports RT_TABLE_DEFAULT. + */ + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_TABLE - MRT_BASE], + &table_id, sizeof(table_id)); + ASSERT_EQ(0, err); + + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], + &vif, sizeof(vif)); + ASSERT_EQ(0, err); + + mfc_attr.table = table_id; + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + ASSERT_EQ(0, err); + + /* Flush mrt->vif_table[] and all caches. */ + flags = MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | + MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC; + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_FLUSH - MRT_BASE], + &flags, sizeof(flags)); + ASSERT_EQ(0, err); +} + +TEST_HARNESS_MAIN From 261950e0390b70f1f17947423a36b8d9baae80f2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:20 +0000 Subject: [PATCH 02/15] ipmr: Annotate access to mrt->mroute_do_{pim,assert,wrvifwhole}. These fields in struct mr_table are updated in ip_mroute_setsockopt() under RTNL: * mroute_do_pim * mroute_do_assert * mroute_do_wrvifwhole However, ip_mroute_getsockopt() does not hold RTNL and reads the first two fields locklessly, and ip_mr_forward() reads all three under RCU. pim_rcv_v1() also reads mroute_do_pim locklessly. 
Let's use WRITE_ONCE() and READ_ONCE() for them. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 131382c388e9..970f173654c7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1506,7 +1506,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, ret = -EFAULT; break; } - mrt->mroute_do_assert = val; + WRITE_ONCE(mrt->mroute_do_assert, val); break; case MRT_PIM: if (!ipmr_pimsm_enabled()) { @@ -1525,9 +1525,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { - mrt->mroute_do_pim = val; - mrt->mroute_do_assert = val; - mrt->mroute_do_wrvifwhole = do_wrvifwhole; + WRITE_ONCE(mrt->mroute_do_pim, val); + WRITE_ONCE(mrt->mroute_do_assert, val); + WRITE_ONCE(mrt->mroute_do_wrvifwhole, do_wrvifwhole); } break; case MRT_TABLE: @@ -1610,10 +1610,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, case MRT_PIM: if (!ipmr_pimsm_enabled()) return -ENOPROTOOPT; - val = mrt->mroute_do_pim; + val = READ_ONCE(mrt->mroute_do_pim); break; case MRT_ASSERT: - val = mrt->mroute_do_assert; + val = READ_ONCE(mrt->mroute_do_assert); break; default: return -ENOPROTOOPT; @@ -2037,20 +2037,20 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, atomic_long_inc(&c->_c.mfc_un.res.wrong_if); - if (true_vifi >= 0 && mrt->mroute_do_assert && + if (true_vifi >= 0 && READ_ONCE(mrt->mroute_do_assert) && /* pimsm uses asserts, when switching from RPT to SPT, * so that we cannot check that packet arrived on an oif. * It is bad, but otherwise we would need to move pretty * large chunk of pimd to kernel. Ough... 
--ANK */ - (mrt->mroute_do_pim || + (READ_ONCE(mrt->mroute_do_pim) || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); - if (mrt->mroute_do_wrvifwhole) + if (READ_ONCE(mrt->mroute_do_wrvifwhole)) ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRVIFWHOLE); } @@ -2358,7 +2358,7 @@ int pim_rcv_v1(struct sk_buff *skb) mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) goto drop; - if (!mrt->mroute_do_pim || + if (!READ_ONCE(mrt->mroute_do_pim) || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; From 402a8111d7becb4220a94f5684edfbd5d4668ddb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:21 +0000 Subject: [PATCH 03/15] ipmr: Convert ipmr_rtm_dumplink() to RCU. net->ipv4.mr_tables is updated under RTNL and can be read safely under RCU. Once created, the multicast route tables are not removed until netns dismantle. ipmr_rtm_dumplink() does not need RTNL protection for ipmr_for_each_table() and ipmr_fill_table() if RCU is held. Even if mrt->maxvif changes concurrently, ipmr_fill_vif() returns true to continue dumping the next table. Let's convert it to RCU. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-4-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 970f173654c7..eecc79a835d1 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2901,12 +2901,13 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) || nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) || nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM, - mrt->mroute_reg_vif_num) || + READ_ONCE(mrt->mroute_reg_vif_num)) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, - mrt->mroute_do_assert) || - nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || + READ_ONCE(mrt->mroute_do_assert)) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, + READ_ONCE(mrt->mroute_do_pim)) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, - mrt->mroute_do_wrvifwhole)) + READ_ONCE(mrt->mroute_do_wrvifwhole))) return false; return true; @@ -2919,7 +2920,7 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) struct vif_device *vif; vif = &mrt->vif_table[vifid]; - vif_dev = rtnl_dereference(vif->dev); + vif_dev = vif_dev_read(vif); /* if the VIF doesn't exist just continue */ if (!vif_dev) return true; @@ -2928,16 +2929,16 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) if (!vif_nest) return false; - if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) || + if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, READ_ONCE(vif_dev->ifindex)) || nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) || nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) || - nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, READ_ONCE(vif->bytes_in), IPMRA_VIFA_PAD) || - nla_put_u64_64bit(skb, 
IPMRA_VIFA_BYTES_OUT, vif->bytes_out, + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, READ_ONCE(vif->bytes_out), IPMRA_VIFA_PAD) || - nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in, + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, READ_ONCE(vif->pkt_in), IPMRA_VIFA_PAD) || - nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out, + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, READ_ONCE(vif->pkt_out), IPMRA_VIFA_PAD) || nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) || nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) { @@ -2992,6 +2993,8 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; s_e = cb->args[1]; + rcu_read_lock(); + ipmr_for_each_table(mrt, net) { struct nlattr *vifs, *af; struct ifinfomsg *hdr; @@ -3026,7 +3029,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) nlmsg_end(skb, nlh); goto out; } - for (i = 0; i < mrt->maxvif; i++) { + for (i = 0; i < READ_ONCE(mrt->maxvif); i++) { if (e < s_e) goto skip_entry; if (!ipmr_fill_vif(mrt, i, skb)) { @@ -3048,6 +3051,8 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) } out: + rcu_read_unlock(); + cb->args[1] = e; cb->args[0] = t; @@ -3287,7 +3292,7 @@ static struct pernet_operations ipmr_net_ops = { static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK, - .dumpit = ipmr_rtm_dumplink}, + .dumpit = ipmr_rtm_dumplink, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE, .doit = ipmr_rtm_route}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE, From 2bd6c9d600d66497f5293dbb8ec61a5e80f13e64 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:22 +0000 Subject: [PATCH 04/15] ipmr: Use MAXVIFS in mroute_msgsize(). mroute_msgsize() calculates skb size needed for ipmr_fill_mroute(). The size differs based on mrt->maxvif. 
We will drop RTNL for ipmr_rtm_getroute() and mrt->maxvif may change under RCU. To avoid -EMSGSIZE, let's calculate the size with the maximum value of mrt->maxvif, MAXVIFS. struct rtnexthop is 8 bytes and MAXVIFS is 32, so the maximum delta is 256 bytes, which is small enough. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index eecc79a835d1..9f2dd726affb 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2510,7 +2510,7 @@ static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, cmd, flags); } -static size_t mroute_msgsize(bool unresolved, int maxvif) +static size_t mroute_msgsize(bool unresolved) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -2523,7 +2523,7 @@ static size_t mroute_msgsize(bool unresolved, int maxvif) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ - + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) + + MAXVIFS * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; @@ -2538,8 +2538,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS, - mrt->maxvif), + skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS), GFP_ATOMIC); if (!skb) goto errout; @@ -2711,7 +2710,7 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout_free; } - skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL); + skb = nlmsg_new(mroute_msgsize(false), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout_free; From 295a17b3eae97910c2664e7905a903b483c4089c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:23 +0000 
Subject: [PATCH 05/15] ipmr: Convert ipmr_rtm_getroute() to RCU. ipmr_rtm_getroute() calls __ipmr_get_table(), ipmr_cache_find(), and ipmr_fill_mroute(). The table is not removed until netns dismantle, and net->ipv4.mr_tables is managed with RCU list API, so __ipmr_get_table() is safe under RCU. struct mfc_cache is freed by mr_cache_put() after RCU grace period, so we can use ipmr_cache_find() under RCU. rcu_read_lock() around it was just to avoid lockdep splat for rhl_for_each_entry_rcu(). ipmr_fill_mroute() calls mr_fill_mroute(), which properly uses RCU. Let's drop RTNL for ipmr_rtm_getroute() and use RCU instead. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 44 +++++++++++++++++++++++--------------------- net/ipv4/ipmr_base.c | 4 ++-- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9f2dd726affb..cb1a5e3a6296 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2680,9 +2680,9 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX + 1]; - struct sk_buff *skb = NULL; struct mfc_cache *cache; struct mr_table *mrt; + struct sk_buff *skb; __be32 src, grp; u32 tableid; int err; @@ -2695,39 +2695,40 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, grp = nla_get_in_addr_default(tb[RTA_DST], 0); tableid = nla_get_u32_default(tb[RTA_TABLE], 0); - mrt = __ipmr_get_table(net, tableid ? 
tableid : RT_TABLE_DEFAULT); - if (!mrt) { - err = -ENOENT; - goto errout_free; - } - - /* entries are added/deleted only under RTNL */ - rcu_read_lock(); - cache = ipmr_cache_find(mrt, src, grp); - rcu_read_unlock(); - if (!cache) { - err = -ENOENT; - goto errout_free; - } - skb = nlmsg_new(mroute_msgsize(false), GFP_KERNEL); if (!skb) { err = -ENOBUFS; - goto errout_free; + goto errout; + } + + rcu_read_lock(); + + mrt = __ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT); + if (!mrt) { + err = -ENOENT; + goto errout_unlock; + } + + cache = ipmr_cache_find(mrt, src, grp); + if (!cache) { + err = -ENOENT; + goto errout_unlock; } err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) - goto errout_free; + goto errout_unlock; + + rcu_read_unlock(); err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); - errout: return err; -errout_free: +errout_unlock: + rcu_read_unlock(); kfree_skb(skb); goto errout; } @@ -3297,7 +3298,8 @@ static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE, .doit = ipmr_rtm_route}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE, - .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute}, + .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip_mr_init(void) diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 2d62526406ca..b0fd9ffa01a2 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -223,7 +223,7 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, rcu_read_lock(); vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev); - if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) { + if (vif_dev && nla_put_u32(skb, RTA_IIF, READ_ONCE(vif_dev->ifindex)) < 0) { rcu_read_unlock(); return -EMSGSIZE; } @@ -252,7 +252,7 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 
nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = vif_dev->ifindex; + nhp->rtnh_ifindex = READ_ONCE(vif_dev->ifindex); nhp->rtnh_len = sizeof(*nhp); } } From 2c698bab294aa273dacd8d6b72db1d79ef994385 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:24 +0000 Subject: [PATCH 06/15] ipmr: Convert ipmr_rtm_dumproute() to RCU. ipmr_rtm_dumproute() calls mr_table_dump() or mr_rtm_dumproute(), and mr_rtm_dumproute() finally calls mr_table_dump(). mr_table_dump() calls the passed function, _ipmr_fill_mroute(). _ipmr_fill_mroute() is a wrapper of ipmr_fill_mroute() to cast struct mr_mfc * to struct mfc_cache *. ipmr_fill_mroute() can be already called safely under RCU. Let's convert ipmr_rtm_dumproute() to RCU. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index cb1a5e3a6296..5c8508788fb6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2736,15 +2736,17 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { - .rtnl_held = true, + .rtnl_held = false, }; int err; + rcu_read_lock(); + if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh, &filter, cb); if (err < 0) - return err; + goto out; } if (filter.table_id) { @@ -2752,19 +2754,28 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) mrt = __ipmr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { - if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) - return skb->len; + if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) { + err = skb->len; + goto out; + } NL_SET_ERR_MSG(cb->extack, "ipv4: MR table 
does not exist"); - return -ENOENT; + err = -ENOENT; + goto out; } + err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute, &mfc_unres_lock, &filter); - return skb->len ? : err; + err = skb->len ? : err; + goto out; } - return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, - _ipmr_fill_mroute, &mfc_unres_lock, &filter); + err = mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, + _ipmr_fill_mroute, &mfc_unres_lock, &filter); +out: + rcu_read_unlock(); + + return err; } static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { @@ -3299,7 +3310,7 @@ static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { .doit = ipmr_rtm_route}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE, .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute, - .flags = RTNL_FLAG_DOIT_UNLOCKED}, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; int __init ip_mr_init(void) From 3810f9529dc7f784b5b958b2a018bb6996cf9077 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:25 +0000 Subject: [PATCH 07/15] ipmr: Move unregister_netdevice_many() out of mroute_clean_tables(). This is a prep commit to convert ipmr_net_exit_batch() to ->exit_rtnl(). Let's move unregister_netdevice_many() in mroute_clean_tables() to its callers. As a bonus, mrtsock_destruct() can do batching for all tables. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-8-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5c8508788fb6..d15e05662b09 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -112,7 +112,8 @@ static int ipmr_cache_report(const struct mr_table *mrt, static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); -static void mroute_clean_tables(struct mr_table *mrt, int flags); +static void mroute_clean_tables(struct mr_table *mrt, int flags, + struct list_head *dev_kill_list); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -427,12 +428,15 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void ipmr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); + LIST_HEAD(dev_kill_list); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | - MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC); + MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC, + &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } @@ -1293,12 +1297,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, } /* Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr_table *mrt, int flags) +static void mroute_clean_tables(struct mr_table *mrt, int flags, + struct list_head *dev_kill_list) { struct net *net = read_pnet(&mrt->net); - struct mr_mfc *c, *tmp; struct mfc_cache *cache; - LIST_HEAD(list); + struct mr_mfc *c, *tmp; int i; /* Shut down all active vif entries */ @@ -1308,9 +1312,8 @@ static 
void mroute_clean_tables(struct mr_table *mrt, int flags) !(flags & MRT_FLUSH_VIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS))) continue; - vif_delete(mrt, i, 0, &list); + vif_delete(mrt, i, 0, dev_kill_list); } - unregister_netdevice_many(&list); } /* Wipe the cache */ @@ -1349,9 +1352,11 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) static void mrtsock_destruct(struct sock *sk) { struct net *net = sock_net(sk); + LIST_HEAD(dev_kill_list); struct mr_table *mrt; rtnl_lock(); + ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; @@ -1360,9 +1365,13 @@ static void mrtsock_destruct(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); - mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC); + mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC, + &dev_kill_list); } } + + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); } @@ -1485,7 +1494,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, sk == rtnl_dereference(mrt->mroute_sk), parent); break; - case MRT_FLUSH: + case MRT_FLUSH: { + LIST_HEAD(dev_kill_list); + if (optlen != sizeof(val)) { ret = -EINVAL; break; @@ -1494,8 +1505,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, ret = -EFAULT; break; } - mroute_clean_tables(mrt, val); + + mroute_clean_tables(mrt, val, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); break; + } /* Control PIM assert. */ case MRT_ASSERT: if (optlen != sizeof(val)) { From b7fdc3cfb60a4dd80bb71c818fe433d8b3449cf3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:26 +0000 Subject: [PATCH 08/15] ipmr: Move unregister_netdevice_many() out of ipmr_free_table(). This is a prep commit to convert ipmr_net_exit_batch() to ->exit_rtnl(). Let's move unregister_netdevice_many() in ipmr_free_table() to its callers. 
Now ipmr_rules_exit() can do batching all tables per netns. Note that later we will remove RTNL and unregister_netdevice_many() in ipmr_rules_init(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-9-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d15e05662b09..7e2aa2026f01 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -102,7 +102,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __ro_after_init; static struct mr_table *ipmr_new_table(struct net *net, u32 id); -static void ipmr_free_table(struct mr_table *mrt); +static void ipmr_free_table(struct mr_table *mrt, + struct list_head *dev_kill_list); static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, @@ -251,6 +252,7 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { static int __net_init ipmr_rules_init(struct net *net) { struct fib_rules_ops *ops; + LIST_HEAD(dev_kill_list); struct mr_table *mrt; int err; @@ -275,7 +277,8 @@ static int __net_init ipmr_rules_init(struct net *net) err2: rtnl_lock(); - ipmr_free_table(mrt); + ipmr_free_table(mrt, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); err1: fib_rules_unregister(ops); @@ -285,12 +288,15 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit(struct net *net) { struct mr_table *mrt, *next; + LIST_HEAD(dev_kill_list); ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); - ipmr_free_table(mrt); + ipmr_free_table(mrt, &dev_kill_list); } + + unregister_netdevice_many(&dev_kill_list); fib_rules_unregister(net->ipv4.mr_rules_ops); } @@ -349,8 +355,13 @@ static int __net_init ipmr_rules_init(struct net *net) 
static void __net_exit ipmr_rules_exit(struct net *net) { + LIST_HEAD(dev_kill_list); + ASSERT_RTNL(); - ipmr_free_table(net->ipv4.mrt); + + ipmr_free_table(net->ipv4.mrt, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + net->ipv4.mrt = NULL; } @@ -425,18 +436,16 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) ipmr_expire_process, ipmr_new_table_set); } -static void ipmr_free_table(struct mr_table *mrt) +static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_list) { struct net *net = read_pnet(&mrt->net); - LIST_HEAD(dev_kill_list); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC, - &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); + dev_kill_list); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } From b22b01867406bcafbf61b61dccdf5b0afbd89fdc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:27 +0000 Subject: [PATCH 09/15] ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl(). ipmr_net_ops uses ->exit_batch() to acquire RTNL only once for dying network namespaces. ipmr does not depend on the ordering of ->exit_rtnl() and ->exit_batch() of other pernet_operations (unlike fib_net_ops). Once ipmr_free_table() is called and all devices are queued for destruction in ->exit_rtnl(), later during NETDEV_UNREGISTER, ipmr_device_event() will not see anything in vif table and just do nothing. Let's convert ipmr_net_exit_batch() to ->exit_rtnl(). Note that fib_rules_unregister() does not need RTNL and we will remove RTNL and unregister_netdevice_many() in ipmr_net_init(). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-10-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7e2aa2026f01..72761c8b2930 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -285,18 +285,17 @@ static int __net_init ipmr_rules_init(struct net *net) return err; } -static void __net_exit ipmr_rules_exit(struct net *net) +static void __net_exit ipmr_rules_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) { struct mr_table *mrt, *next; - LIST_HEAD(dev_kill_list); ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); - ipmr_free_table(mrt, &dev_kill_list); + ipmr_free_table(mrt, dev_kill_list); } - unregister_netdevice_many(&dev_kill_list); fib_rules_unregister(net->ipv4.mr_rules_ops); } @@ -353,14 +352,12 @@ static int __net_init ipmr_rules_init(struct net *net) return 0; } -static void __net_exit ipmr_rules_exit(struct net *net) +static void __net_exit ipmr_rules_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) { - LIST_HEAD(dev_kill_list); - ASSERT_RTNL(); - ipmr_free_table(net->ipv4.mrt, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); + ipmr_free_table(net->ipv4.mrt, dev_kill_list); net->ipv4.mrt = NULL; } @@ -3264,6 +3261,7 @@ static void __net_exit ipmr_notifier_exit(struct net *net) /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { + LIST_HEAD(dev_kill_list); int err; err = ipmr_notifier_init(net); @@ -3290,7 +3288,8 @@ static int __net_init ipmr_net_init(struct net *net) remove_proc_entry("ip_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); - ipmr_rules_exit(net); + ipmr_rules_exit_rtnl(net, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); #endif ipmr_rules_fail: @@ -3308,20 
+3307,16 @@ static void __net_exit ipmr_net_exit(struct net *net) ipmr_notifier_exit(net); } -static void __net_exit ipmr_net_exit_batch(struct list_head *net_list) +static void __net_exit ipmr_net_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) { - struct net *net; - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) - ipmr_rules_exit(net); - rtnl_unlock(); + ipmr_rules_exit_rtnl(net, dev_kill_list); } static struct pernet_operations ipmr_net_ops = { .init = ipmr_net_init, .exit = ipmr_net_exit, - .exit_batch = ipmr_net_exit_batch, + .exit_rtnl = ipmr_net_exit_rtnl, }; static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { From 4a11adcd9eefb841d4595267bbd4df304a98ded6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:28 +0000 Subject: [PATCH 10/15] ipmr: Remove RTNL in ipmr_rules_init() and ipmr_net_init(). When ipmr_free_table() is called from ipmr_rules_init() or ipmr_net_init(), the netns is not yet published. Thus, no device should have been registered, and mroute_clean_tables() will not call vif_delete(), so unregister_netdevice_many() is unnecessary. unregister_netdevice_many() does nothing if the list is empty, but it requires RTNL due to the unconditional ASSERT_RTNL() at the entry of unregister_netdevice_many_notify(). Let's remove unnecessary RTNL and ASSERT_RTNL() and instead add WARN_ON_ONCE() in ipmr_free_table(). Note that we use a local list for the new WARN_ON_ONCE() because dev_kill_list passed from ipmr_rules_exit_rtnl() may have some devices when other ops->init() fails after ipmr during setup_net(). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-11-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 72761c8b2930..c22bcaead348 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -276,10 +276,7 @@ static int __net_init ipmr_rules_init(struct net *net) return 0; err2: - rtnl_lock(); ipmr_free_table(mrt, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); err1: fib_rules_unregister(ops); return err; @@ -290,7 +287,6 @@ static void __net_exit ipmr_rules_exit_rtnl(struct net *net, { struct mr_table *mrt, *next; - ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); ipmr_free_table(mrt, dev_kill_list); @@ -355,8 +351,6 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { - ASSERT_RTNL(); - ipmr_free_table(net->ipv4.mrt, dev_kill_list); net->ipv4.mrt = NULL; @@ -436,15 +430,19 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_list) { struct net *net = read_pnet(&mrt->net); + LIST_HEAD(ipmr_dev_kill_list); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC, - dev_kill_list); + &ipmr_dev_kill_list); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); + + WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list)); + list_splice(&ipmr_dev_kill_list, dev_kill_list); } /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ @@ -3287,10 +3285,7 @@ static int __net_init ipmr_net_init(struct net *net) proc_cache_fail: 
remove_proc_entry("ip_mr_vif", net->proc_net); proc_vif_fail: - rtnl_lock(); ipmr_rules_exit_rtnl(net, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); #endif ipmr_rules_fail: ipmr_notifier_exit(net); From 478c2add78b13e36d781d6891d5861e6e1eecef4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:29 +0000 Subject: [PATCH 11/15] ipmr: Call fib_rules_unregister() without RTNL. fib_rules_unregister() removes ops from net->rules_ops under spinlock, calls ops->delete() for each rule, and frees the ops. ipmr_rules_ops_template does not have ->delete(), and any operation does not require RTNL there. Let's move fib_rules_unregister() from ipmr_rules_exit_rtnl() to ipmr_net_exit(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-12-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c22bcaead348..07f2d4f8dcbe 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -282,6 +282,11 @@ static int __net_init ipmr_rules_init(struct net *net) return err; } +static void __net_exit ipmr_rules_exit(struct net *net) +{ + fib_rules_unregister(net->ipv4.mr_rules_ops); +} + static void __net_exit ipmr_rules_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { @@ -291,8 +296,6 @@ static void __net_exit ipmr_rules_exit_rtnl(struct net *net, list_del(&mrt->list); ipmr_free_table(mrt, dev_kill_list); } - - fib_rules_unregister(net->ipv4.mr_rules_ops); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, @@ -348,6 +351,10 @@ static int __net_init ipmr_rules_init(struct net *net) return 0; } +static void __net_exit ipmr_rules_exit(struct net *net) +{ +} + static void __net_exit ipmr_rules_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { @@ -3286,6 +3293,7 @@ static int __net_init ipmr_net_init(struct 
net *net) remove_proc_entry("ip_mr_vif", net->proc_net); proc_vif_fail: ipmr_rules_exit_rtnl(net, &dev_kill_list); + ipmr_rules_exit(net); #endif ipmr_rules_fail: ipmr_notifier_exit(net); @@ -3299,6 +3307,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_rules_exit(net); ipmr_notifier_exit(net); } From 1c36d186a0c81f3b55b2722736163233b05f8756 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:30 +0000 Subject: [PATCH 12/15] ipmr: Define net->ipv4.{ipmr_notifier_ops,ipmr_seq} under CONFIG_IP_MROUTE. net->ipv4.ipmr_notifier_ops and net->ipv4.ipmr_seq are used only in net/ipv4/ipmr.c. Let's move these definitions under CONFIG_IP_MROUTE. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-13-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8e971c7bf164..380ff34c0233 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -279,6 +279,8 @@ struct netns_ipv4 { struct list_head mr_tables; struct fib_rules_ops *mr_rules_ops; #endif + struct fib_notifier_ops *ipmr_notifier_ops; + unsigned int ipmr_seq; /* protected by rtnl_mutex */ #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; @@ -290,9 +292,6 @@ struct netns_ipv4 { struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* writes protected by rtnl_mutex */ - struct fib_notifier_ops *ipmr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ - atomic_t rt_genid; siphash_key_t ip_id_key; struct hlist_head *inet_addr_lst; From 4480d5fa1f6ebe7dfc546e14371d63c8b915a82d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:31 +0000 Subject: [PATCH 
13/15] ipmr/ip6mr: Convert net->ipv[46].ipmr_seq to atomic_t. We will no longer hold RTNL for ipmr_mfc_add() and ipmr_mfc_delete(). MFC entry can be loosely connected with VIF by its index for mrt->vif_table[] (stored in mfc_parent), but the two tables are not synchronised. i.e. Even if VIF 1 is removed, MFC for VIF 1 is not automatically removed. The only field that the MFC/VIF interfaces share is net->ipv[46].ipmr_seq, which is protected by RTNL. Adding a new mutex for both just to protect a single field is overkill. Let's convert the field to atomic_t. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-14-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 8 ++++---- include/net/netns/ipv4.h | 2 +- include/net/netns/ipv6.h | 2 +- net/ipv4/ipmr.c | 4 ++-- net/ipv6/ip6mr.c | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0075f6e5c3da..0baa6f994da9 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -76,7 +76,7 @@ static inline int mr_call_vif_notifiers(struct net *net, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { @@ -89,7 +89,7 @@ static inline int mr_call_vif_notifiers(struct net *net, }; ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } @@ -198,7 +198,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { @@ -209,7 +209,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, }; ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return 
call_fib_notifiers(net, event_type, &info.info); } diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 380ff34c0233..94dca64fec41 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -280,7 +280,7 @@ struct netns_ipv4 { struct fib_rules_ops *mr_rules_ops; #endif struct fib_notifier_ops *ipmr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t ipmr_seq; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 34bdb1308e8f..499e4288170f 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -118,7 +118,7 @@ struct netns_ipv6 { struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; struct fib_notifier_ops *ip6mr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t ipmr_seq; struct { struct hlist_head head; spinlock_t lock; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 07f2d4f8dcbe..6ec73796d84d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3226,7 +3226,7 @@ static const struct net_protocol pim_protocol = { static unsigned int ipmr_seq_read(const struct net *net) { - return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); + return atomic_read(&net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); } static int ipmr_dump(struct net *net, struct notifier_block *nb, @@ -3247,7 +3247,7 @@ static int __net_init ipmr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; - net->ipv4.ipmr_seq = 0; + atomic_set(&net->ipv4.ipmr_seq, 0); ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); if (IS_ERR(ops)) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e047a4680ab0..85010ff21c98 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1280,7 +1280,7 @@ static int ip6mr_device_event(struct notifier_block *this, static unsigned int ip6mr_seq_read(const struct net *net) { - return 
READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); + return atomic_read(&net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, @@ -1305,7 +1305,7 @@ static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; - net->ipv6.ipmr_seq = 0; + atomic_set(&net->ipv6.ipmr_seq, 0); ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) From 3c1e53e55418d4ca4040e281501643a96e227974 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:32 +0000 Subject: [PATCH 14/15] ipmr: Add dedicated mutex for mrt->{mfc_hash,mfc_cache_list}. We will no longer hold RTNL for ipmr_rtm_route() to modify the MFC hash table. Only __dev_get_by_index() in rtm_to_ipmr_mfcc() is RTNL-dependent; otherwise, we just need protection for mrt->mfc_hash and mrt->mfc_cache_list. Let's add a new mutex for ipmr_mfc_add(), ipmr_mfc_delete(), and mroute_clean_tables() (setsockopt(MRT_FLUSH or MRT_DONE)). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-15-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + net/ipv4/ipmr.c | 28 ++++++++++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 94dca64fec41..4c249aeaf7f1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -281,6 +281,7 @@ struct netns_ipv4 { #endif struct fib_notifier_ops *ipmr_notifier_ops; atomic_t ipmr_seq; + struct mutex mfc_mutex; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 6ec73796d84d..d4983d8a9b2a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1329,6 +1329,8 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags, /* Wipe the cache */ if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) { + mutex_lock(&net->ipv4.mfc_mutex); + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC))) @@ -1341,6 +1343,8 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags, mroute_netlink_event(mrt, cache, RTM_DELROUTE); mr_cache_put(c); } + + mutex_unlock(&net->ipv4.mfc_mutex); } if (flags & MRT_FLUSH_MFC) { @@ -1498,12 +1502,17 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, } if (parent == 0) parent = mfc.mfcc_parent; + + mutex_lock(&net->ipv4.mfc_mutex); + if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) ret = ipmr_mfc_delete(mrt, &mfc, parent); else ret = ipmr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); + + mutex_unlock(&net->ipv4.mfc_mutex); break; case MRT_FLUSH: { LIST_HEAD(dev_kill_list); @@ -2913,21 +2922,26 @@ static int ipmr_rtm_route(struct sk_buff *skb, 
struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - int ret, mrtsock, parent; - struct mr_table *tbl; + int ret, mrtsock = 0, parent; + struct mr_table *tbl = NULL; struct mfcctl mfcc; - mrtsock = 0; - tbl = NULL; ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack); if (ret < 0) return ret; parent = ret ? mfcc.mfcc_parent : -1; + + mutex_lock(&net->ipv4.mfc_mutex); + if (nlh->nlmsg_type == RTM_NEWROUTE) - return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); + ret = ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); else - return ipmr_mfc_delete(tbl, &mfcc, parent); + ret = ipmr_mfc_delete(tbl, &mfcc, parent); + + mutex_unlock(&net->ipv4.mfc_mutex); + + return ret; } static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) @@ -3269,6 +3283,8 @@ static int __net_init ipmr_net_init(struct net *net) LIST_HEAD(dev_kill_list); int err; + mutex_init(&net->ipv4.mfc_mutex); + err = ipmr_notifier_init(net); if (err) goto ipmr_notifier_fail; From bddafc06ca5ee1be4d10061f7954c6d6be5dc1d8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:33 +0000 Subject: [PATCH 15/15] ipmr: Don't hold RTNL for ipmr_rtm_route(). ipmr_mfc_add() and ipmr_mfc_delete() are already protected by a dedicated mutex. rtm_to_ipmr_mfcc() calls __ipmr_get_table(), __dev_get_by_index(), and ipmr_find_vif(). Once __dev_get_by_index() is converted to dev_get_by_index_rcu(), we can move the other two functions under that same RCU section and drop RTNL for ipmr_rtm_route(). Let's do that conversion and drop ASSERT_RTNL() in mr_call_mfc_notifiers(). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-16-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 1 - net/ipv4/ipmr.c | 34 +++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0baa6f994da9..cf3374580f74 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -208,7 +208,6 @@ static inline int mr_call_mfc_notifiers(struct net *net, .tb_id = tb_id }; - ASSERT_RTNL(); atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d4983d8a9b2a..8a08d09b4c30 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1211,7 +1211,6 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); @@ -1238,7 +1237,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (mfc->mfcc_parent >= MAXVIFS) return -ENFILE; - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); @@ -2853,10 +2851,10 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, { struct net_device *dev = NULL; u32 tblid = RT_TABLE_DEFAULT; + int ret, rem, iif = 0; struct mr_table *mrt; struct nlattr *attr; struct rtmsg *rtm; - int ret, rem; ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, extack); @@ -2883,11 +2881,7 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr); break; case RTA_IIF: - dev = __dev_get_by_index(net, nla_get_u32(attr)); - if 
(!dev) { - ret = -ENODEV; - goto out; - } + iif = nla_get_u32(attr); break; case RTA_MULTIPATH: if (ipmr_nla_get_ttls(attr, mfcc) < 0) { @@ -2903,16 +2897,30 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, break; } } + + rcu_read_lock(); + mrt = __ipmr_get_table(net, tblid); if (!mrt) { ret = -ENOENT; - goto out; + goto unlock; } + + if (iif) { + dev = dev_get_by_index_rcu(net, iif); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + + mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); + } + *mrtret = mrt; *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0; - if (dev) - mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); +unlock: + rcu_read_unlock(); out: return ret; } @@ -3343,9 +3351,9 @@ static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK, .dumpit = ipmr_rtm_dumplink, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE, - .doit = ipmr_rtm_route}, + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE, - .doit = ipmr_rtm_route}, + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE, .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},