From e3411e326fa48c9be09ba449330352ba698db698 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:09 +0200 Subject: [PATCH 01/15] net: ipv4: Add a flags argument to iptunnel_xmit(), udp_tunnel_xmit_skb() iptunnel_xmit() erases the contents of the SKB control block. In order to be able to set particular IPCB flags on the SKB, add a corresponding parameter, and propagate it to udp_tunnel_xmit_skb() as well. In one of the following patches, VXLAN driver will use this facility to mark packets as subject to IP multicast routing. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Acked-by: Antonio Quartulli Link: https://patch.msgid.link/89c9daf9f2dc088b6b92ccebcc929f51742de91f.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 9 ++++++--- drivers/net/bareudp.c | 4 ++-- drivers/net/geneve.c | 4 ++-- drivers/net/gtp.c | 10 ++++++---- drivers/net/ovpn/udp.c | 2 +- drivers/net/vxlan/vxlan_core.c | 2 +- drivers/net/wireguard/socket.c | 2 +- include/net/ip_tunnels.h | 2 +- include/net/udp_tunnel.h | 2 +- net/ipv4/ip_tunnel.c | 4 ++-- net/ipv4/ip_tunnel_core.c | 4 +++- net/ipv4/udp_tunnel_core.c | 5 +++-- net/ipv6/sit.c | 2 +- net/sctp/protocol.c | 3 ++- net/tipc/udp_media.c | 2 +- 15 files changed, 33 insertions(+), 24 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index fb130fde68c0..ed86537b2f61 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -1046,7 +1046,8 @@ static bool amt_send_membership_update(struct amt_dev *amt, amt->gw_port, amt->relay_port, false, - false); + false, + 0); amt_update_gw_status(amt, AMT_STATUS_SENT_UPDATE, true); return false; } @@ -1103,7 +1104,8 @@ static void amt_send_multicast_data(struct amt_dev *amt, amt->relay_port, tunnel->source_port, false, - false); + false, + 0); } static bool amt_send_membership_query(struct amt_dev *amt, @@ -1161,7 +1163,8 @@ static bool amt_send_membership_query(struct amt_dev *amt, amt->relay_port, tunnel->source_port, false, - false); + false, + 0); amt_update_relay_status(tunnel, AMT_STATUS_SENT_QUERY, true); return false; } diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index a9dffdcac805..5e613080d3f8 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -362,8 +362,8 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, bareudp->port, !net_eq(bareudp->net, dev_net(bareudp->dev)), - !test_bit(IP_TUNNEL_CSUM_BIT, - info->key.tun_flags)); + !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), + 0); return 0; free_dst: diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index ffc15a432689..c668e8b00ed2 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -921,8 +921,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, geneve->cfg.info.key.tp_dst, !net_eq(geneve->net, dev_net(geneve->dev)), - !test_bit(IP_TUNNEL_CSUM_BIT, - info->key.tun_flags)); + !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), + 0); return 0; } diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index d4dec741c7f4..14584793fe4e 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -446,7 +446,8 @@ static int gtp0_send_echo_resp_ip(struct gtp_dev *gtp, struct sk_buff *skb) htons(GTP0_PORT), htons(GTP0_PORT), !net_eq(sock_net(gtp->sk1u), dev_net(gtp->dev)), - false); + false, + 0); return 0; } @@ -704,7 +705,8 @@ static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) htons(GTP1U_PORT), htons(GTP1U_PORT), !net_eq(sock_net(gtp->sk1u), dev_net(gtp->dev)), - false); + false, + 0); return 0; } @@ -1304,7 +1306,7 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) pktinfo.gtph_port, pktinfo.gtph_port, !net_eq(sock_net(pktinfo.pctx->sk), dev_net(dev)), - false); + false, 0); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) @@ -2405,7 +2407,7 @@ static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info) port, port, !net_eq(sock_net(sk), dev_net(gtp->dev)), - false); + false, 0); return 0; } diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c index bff00946eae2..d866e6bfda70 100644 --- a/drivers/net/ovpn/udp.c +++ b/drivers/net/ovpn/udp.c @@ -199,7 +199,7 @@ static int ovpn_udp4_output(struct ovpn_peer *peer, struct ovpn_bind *bind, transmit: udp_tunnel_xmit_skb(rt, sk, skb, fl.saddr, fl.daddr, 0, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, - fl.fl4_dport, false, sk->sk_no_check_tx); + fl.fl4_dport, false, sk->sk_no_check_tx, 0); ret = 0; err: local_bh_enable(); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 97792de896b7..1cc18acd242d 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2522,7 +2522,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, pkey->u.ipv4.dst, tos, ttl, df, - src_port, dst_port, xnet, !udp_sum); + src_port, dst_port, xnet, !udp_sum, 0); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 0414d7a6ce74..88e685667bc0 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -84,7 +84,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, skb->ignore_df = 1; udp_tunnel_xmit_skb(rt, sock, skb, fl.saddr, fl.daddr, ds, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, - fl.fl4_dport, false, false); + fl.fl4_dport, false, false, 0); goto out; err: diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0c3d571a04a1..8cf1380f3656 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -603,7 +603,7 @@ static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, - u8 tos, u8 ttl, __be16 df, bool xnet); + u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 2df3b8344eb5..28102c8fd8a8 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -150,7 +150,7 @@ static inline void udp_tunnel_drop_rx_info(struct net_device *dev) void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck); + bool xnet, bool nocheck, u16 ipcb_flags); int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 678b8f96e3e9..aaeb5d16f0c9 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -668,7 +668,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_adj_headroom(dev, headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); return; tx_error: DEV_STATS_INC(dev, tx_errors); @@ -857,7 +857,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_adj_headroom(dev, max_headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); return; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f65d2f727381..cc9915543637 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -49,7 +49,8 @@ EXPORT_SYMBOL(ip6tun_encaps); void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, - __u8 tos, __u8 ttl, __be16 df, bool xnet) + __u8 tos, __u8 ttl, __be16 df, bool xnet, + u16 ipcb_flags) { int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); @@ -62,6 +63,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_clear_hash_if_not_l4(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + IPCB(skb)->flags = ipcb_flags; /* Push down and install the IP header. */ skb_push(skb, sizeof(struct iphdr)); diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 2326548997d3..9efd62505916 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck) + bool xnet, bool nocheck, u16 ipcb_flags) { struct udphdr *uh; @@ -185,7 +185,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb udp_set_csum(nocheck, skb, src, dst, skb->len); - iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); + iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet, + ipcb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a72dbca9e8fc..12496ba1b7d4 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1035,7 +1035,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, IPPROTO_IPV6); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); return NETDEV_TX_OK; tx_error_icmp: diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f402f90eb6b6..a5ccada55f2b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1103,7 +1103,8 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_set_inner_ipproto(skb, IPPROTO_SCTP); udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr, fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, - sctp_sk(sk)->udp_port, t->encap_port, false, false); + sctp_sk(sk)->udp_port, t->encap_port, false, false, + 0); return 0; } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 108a4cc2e001..87e8c1e6d550 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -197,7 +197,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->port, - dst->port, false, true); + dst->port, false, true, 0); #if IS_ENABLED(CONFIG_IPV6) } else { if (!ndst) { From 3b7bc938e0ada6a791103faae261dd2a770df86d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:10 +0200 Subject: [PATCH 02/15] net: ipv4: ipmr: ipmr_queue_xmit(): Drop local variable `dev' The variable is used for caching of rt->dst.dev. The netdevice referenced therein does not change during the scope of validity of that local. At the same time, the local is only used twice, and each of these uses will end up in a different function in the following patches, further eliminating any use the local could have had. Drop the local altogether and inline the uses. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/c80600a4b51679fe78f429ccb6d60892c2f9e4de.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a7d09ae9d761..d2ac630bea3a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1859,7 +1859,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; - struct net_device *dev; struct rtable *rt; struct flowi4 fl4; int encap = 0; @@ -1898,8 +1897,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } - dev = rt->dst.dev; - if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not * allow to send ICMP, so that packets will disappear @@ -1910,7 +1907,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } - encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len; + encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); @@ -1947,7 +1944,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * result in receiving multiple packets. */ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, - net, NULL, skb, skb->dev, dev, + net, NULL, skb, skb->dev, rt->dst.dev, ipmr_forward_finish); return; From b2e653bcff0f3eb43183393548f7142c176faa6d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:11 +0200 Subject: [PATCH 03/15] net: ipv4: ipmr: Split ipmr_queue_xmit() in two Some of the work of ipmr_queue_xmit() is specific to IPMR forwarding, and should not take place on the output path. In order to allow reuse of the common parts, split the function into two: the ipmr_prepare_xmit() helper that takes care of the common bits, and the ipmr_queue_fwd_xmit(), which invokes the former and encapsulates the whole forwarding algorithm. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/4e8db165572a4f8bd29a723a801e854e9d20df4d.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d2ac630bea3a..74d45fd5d11e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1853,8 +1853,8 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, /* Processing handlers for ipmr_forward, under rcu_read_lock() */ -static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - int in_vifi, struct sk_buff *skb, int vifi) +static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -1865,7 +1865,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif_dev = vif_dev_read(vif); if (!vif_dev) - goto out_free; + return -1; if (vif->flags & VIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); @@ -1873,12 +1873,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); - goto out_free; + return -1; } - if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) - goto out_free; - if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, @@ -1886,7 +1883,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, IPPROTO_IPIP, iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) - goto out_free; + return -1; encap = sizeof(struct iphdr); } else { rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, @@ -1894,7 +1891,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, IPPROTO_IPIP, iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) - goto out_free; + return -1; } if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { @@ -1904,14 +1901,14 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, */ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); - goto out_free; + return -1; } encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); - goto out_free; + return -1; } WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); @@ -1931,6 +1928,22 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); } + return 0; +} + +static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt, + int in_vifi, struct sk_buff *skb, int vifi) +{ + struct rtable *rt; + + if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) + goto out_free; + + if (ipmr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; + + rt = skb_rtable(skb); + IPCB(skb)->flags |= IPSKB_FORWARDED; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally @@ -2062,8 +2075,8 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, true_vifi, - skb2, psend); + ipmr_queue_fwd_xmit(net, mrt, true_vifi, + skb2, psend); } psend = ct; } @@ -2074,10 +2087,10 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, true_vifi, skb2, - psend); + ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb2, + psend); } else { - ipmr_queue_xmit(net, mrt, true_vifi, skb, psend); + ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb, psend); return; } } From 35bec72a24ace52a7f57642ff2813f22733b08fd Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:12 +0200 Subject: [PATCH 04/15] net: ipv4: Add ip_mr_output() Multicast routing is today handled in the input path. Locally generated MC packets don't hit the IPMR code today. Thus if a VXLAN remote address is multicast, the driver needs to set an OIF during route lookup. Thus MC routing configuration needs to be kept in sync with the VXLAN FDB and MDB. Ideally, the VXLAN packets would be routed by the MC routing code instead. To that end, this patch adds support to route locally generated multicast packets. The newly-added routines do largely what ip_mr_input() and ip_mr_forward() do: make an MR cache lookup to find where to send the packets, and use ip_mc_output() to send each of them. When no cache entry is found, the packet is punted to the daemon for resolution. However, an installation that uses a VXLAN underlay netdevice for which it also has matching MC routes, would get a different routing with this patch. Previously, the MC packets would be delivered directly to the underlay port, whereas now they would be MC-routed. In order to avoid this change in behavior, introduce an IPCB flag. Only if the flag is set will ip_mr_output() actually engage, otherwise it reverts to ip_mc_output(). This code is based on work by Roopa Prabhu and Nikolay Aleksandrov. Signed-off-by: Roopa Prabhu Signed-off-by: Nikolay Aleksandrov Signed-off-by: Benjamin Poirier Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/0aadbd49330471c0f758d54afb05eb3b6e3a6b65.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 2 + net/ipv4/ipmr.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/route.c | 2 +- 3 files changed, 120 insertions(+), 1 deletion(-) diff --git a/include/net/ip.h b/include/net/ip.h index 47ed6d23853d..375304bb99f6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -59,6 +59,7 @@ struct inet_skb_parm { #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) #define IPSKB_MULTIPATH BIT(9) +#define IPSKB_MCROUTE BIT(10) u16 frag_max_size; }; @@ -167,6 +168,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); +int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 74d45fd5d11e..f78c4e53dc8c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1965,6 +1965,19 @@ static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt, kfree_skb(skb); } +static void ipmr_queue_output_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) +{ + if (ipmr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; + + ip_mc_output(net, NULL, skb); + return; + +out_free: + kfree_skb(skb); +} + /* Called with mrt_lock or rcu_read_lock() */ static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev) { @@ -2224,6 +2237,110 @@ int ip_mr_input(struct sk_buff *skb) return 0; } +static void ip_mr_output_finish(struct net *net, struct mr_table *mrt, + struct net_device *dev, struct sk_buff *skb, + struct mfc_cache *c) +{ + int psend = -1; + int ct; + + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); + + /* Forward the frame */ + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == htonl(INADDR_ANY)) { + if (ip_hdr(skb)->ttl > + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. + */ + psend = c->_c.mfc_parent; + goto last_xmit; + } + goto dont_xmit; + } + + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { + if (ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ipmr_queue_output_xmit(net, mrt, + skb2, psend); + } + psend = ct; + } + } + +last_xmit: + if (psend != -1) { + ipmr_queue_output_xmit(net, mrt, skb, psend); + return; + } + +dont_xmit: + kfree_skb(skb); +} + +/* Multicast packets for forwarding arrive here + * Called with rcu_read_lock(); + */ +int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct mfc_cache *cache; + struct net_device *dev; + struct mr_table *mrt; + int vif; + + WARN_ON_ONCE(!rcu_read_lock_held()); + dev = rt->dst.dev; + + if (IPCB(skb)->flags & IPSKB_FORWARDED) + goto mc_output; + if (!(IPCB(skb)->flags & IPSKB_MCROUTE)) + goto mc_output; + + skb->dev = dev; + + mrt = ipmr_rt_fib_lookup(net, skb); + if (IS_ERR(mrt)) + goto mc_output; + + /* already under rcu_read_lock() */ + cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + if (!cache) { + vif = ipmr_find_vif(mrt, dev); + if (vif >= 0) + cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, + vif); + } + + /* No usable cache entry */ + if (!cache) { + vif = ipmr_find_vif(mrt, dev); + if (vif >= 0) + return ipmr_cache_unresolved(mrt, vif, skb, dev); + goto mc_output; + } + + vif = cache->_c.mfc_parent; + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) + goto mc_output; + + ip_mr_output_finish(net, mrt, dev, skb, cache); + return 0; + +mc_output: + return ip_mc_output(net, sk, skb); +} + #ifdef CONFIG_IP_PIMSM_V1 /* Handle IGMP messages of PIMv1 */ int pim_rcv_v1(struct sk_buff *skb) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fccb05fb3a79..3ddf6bf40357 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2660,7 +2660,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; - rth->dst.output = ip_mc_output; + rth->dst.output = ip_mr_output; } } #endif From 6a7d88ca15f73c5c570c372238f71d63da1fda55 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:13 +0200 Subject: [PATCH 05/15] net: ipv6: Make udp_tunnel6_xmit_skb() void The function always returns zero, thus the return value does not carry any signal. Just make it void. Most callers already ignore the return value. However: - Refold arguments of the call from sctp_v6_xmit() so that they fit into the 80-column limit. - tipc_udp_xmit() initializes err from the return value, but that should already be always zero at that point. So there's no practical change, but elision of the assignment prompts a couple more tweaks to clean up the function. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/7facacf9d8ca3ca9391a4aee88160913671b868d.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 14 +++++++------- net/ipv6/ip6_udp_tunnel.c | 15 +++++++-------- net/sctp/ipv6.c | 7 ++++--- net/tipc/udp_media.c | 10 +++++----- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 28102c8fd8a8..0b01f6ade20d 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -152,13 +152,13 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck, u16 ipcb_flags); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck); +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck); void udp_tunnel_sock_release(struct socket *sock); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index c99053189ea8..21681718b7bb 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -74,13 +74,13 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, } EXPORT_SYMBOL_GPL(udp_sock_create6); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck) +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -109,7 +109,6 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, ip6h->saddr = *saddr; ip6tunnel_xmit(sk, skb, dev); - return 0; } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index a9ed2ccab1bd..d1ecf7454827 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -261,9 +261,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_set_inner_ipproto(skb, IPPROTO_SCTP); label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6); - return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, - &fl6->daddr, tclass, ip6_dst_hoplimit(dst), - label, sctp_sk(sk)->udp_port, t->encap_port, false); + udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr, + tclass, ip6_dst_hoplimit(dst), label, + sctp_sk(sk)->udp_port, t->encap_port, false); + return 0; } /* Returns the dst cache entry for the given source and destination ip diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 87e8c1e6d550..414713fcd8c5 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -172,7 +172,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, struct udp_media_addr *dst, struct dst_cache *cache) { struct dst_entry *ndst; - int ttl, err = 0; + int ttl, err; local_bh_disable(); ndst = dst_cache_get(cache); @@ -217,13 +217,13 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, dst_cache_set_ip6(cache, ndst, &fl6.saddr); } ttl = ip6_dst_hoplimit(ndst); - err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, - &src->ipv6, &dst->ipv6, 0, ttl, 0, - src->port, dst->port, false); + udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, + &src->ipv6, &dst->ipv6, 0, ttl, 0, + src->port, dst->port, false); #endif } local_bh_enable(); - return err; + return 0; tx_error: local_bh_enable(); From f78c75d84fe83898f0a00658f593d4f17b38cbc6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:14 +0200 Subject: [PATCH 06/15] net: ipv6: Add a flags argument to ip6tunnel_xmit(), udp_tunnel6_xmit_skb() ip6tunnel_xmit() erases the contents of the SKB control block. In order to be able to set particular IP6CB flags on the SKB, add a corresponding parameter, and propagate it to udp_tunnel6_xmit_skb() as well. In one of the following patches, VXLAN driver will use this facility to mark packets as subject to IPv6 multicast routing. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/acb4f9f3e40c3a931236c3af08a720b017fbfbfb.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 3 ++- drivers/net/geneve.c | 3 ++- drivers/net/gtp.c | 2 +- drivers/net/ovpn/udp.c | 2 +- drivers/net/vxlan/vxlan_core.c | 3 ++- drivers/net/wireguard/socket.c | 2 +- include/net/ip6_tunnel.h | 3 ++- include/net/udp_tunnel.h | 3 ++- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 5 +++-- net/sctp/ipv6.c | 2 +- net/tipc/udp_media.c | 2 +- 12 files changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 5e613080d3f8..0df3208783ad 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -431,7 +431,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, &saddr, &daddr, prio, ttl, info->key.label, sport, bareudp->port, !test_bit(IP_TUNNEL_CSUM_BIT, - info->key.tun_flags)); + info->key.tun_flags), + 0); return 0; free_dst: diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index c668e8b00ed2..f6bd155aae7f 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1014,7 +1014,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, &saddr, &key->u.ipv6.dst, prio, ttl, info->key.label, sport, geneve->cfg.info.key.tp_dst, !test_bit(IP_TUNNEL_CSUM_BIT, - info->key.tun_flags)); + info->key.tun_flags), + 0); return 0; } #endif diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 14584793fe4e..4b668ebaa0f7 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1316,7 +1316,7 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) ip6_dst_hoplimit(&pktinfo.rt->dst), 0, pktinfo.gtph_port, pktinfo.gtph_port, - false); + false, 0); #else goto tx_err; #endif diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c index d866e6bfda70..254cc94c4617 100644 --- a/drivers/net/ovpn/udp.c +++ b/drivers/net/ovpn/udp.c @@ -274,7 +274,7 @@ static int ovpn_udp6_output(struct ovpn_peer *peer, struct ovpn_bind *bind, skb->ignore_df = 1; udp_tunnel6_xmit_skb(dst, sk, skb, skb->dev, &fl.saddr, &fl.daddr, 0, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, - fl.fl6_dport, udp_get_no_check6_tx(sk)); + fl.fl6_dport, udp_get_no_check6_tx(sk), 0); ret = 0; err: local_bh_enable(); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 1cc18acd242d..b22f9866be8e 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2586,7 +2586,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, - pkey->label, src_port, dst_port, !udp_sum); + pkey->label, src_port, dst_port, !udp_sum, + 0); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 88e685667bc0..253488f8c00f 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -151,7 +151,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, skb->ignore_df = 1; udp_tunnel6_xmit_skb(dst, sock, skb, skb->dev, &fl.saddr, &fl.daddr, ds, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, - fl.fl6_dport, false); + fl.fl6_dport, false, 0); goto out; err: diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 399592405c72..dd163495f353 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -152,11 +152,12 @@ int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, - struct net_device *dev) + struct net_device *dev, u16 ip6cb_flags) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + IP6CB(skb)->flags = ip6cb_flags; pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 0b01f6ade20d..e3c70b579095 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -158,7 +158,8 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck); + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags); void udp_tunnel_sock_release(struct socket *sock); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 894d3158a6f0..a885bb5c98ea 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1278,7 +1278,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; - ip6tunnel_xmit(NULL, skb, dev); + ip6tunnel_xmit(NULL, skb, dev, 0); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 21681718b7bb..8ebe17a6058a 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -80,7 +80,8 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck) + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -108,7 +109,7 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, ip6h->daddr = *daddr; ip6h->saddr = *saddr; - ip6tunnel_xmit(sk, skb, dev); + ip6tunnel_xmit(sk, skb, dev, ip6cb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index d1ecf7454827..3336dcfb4515 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -263,7 +263,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr, tclass, ip6_dst_hoplimit(dst), label, - sctp_sk(sk)->udp_port, t->encap_port, false); + sctp_sk(sk)->udp_port, t->encap_port, false, 0); return 0; } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 414713fcd8c5..a024fcc8c0cb 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -219,7 +219,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, ttl = ip6_dst_hoplimit(ndst); udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, &src->ipv6, &dst->ipv6, 0, ttl, 0, - src->port, dst->port, false); + src->port, dst->port, false, 0); #endif } local_bh_enable(); From 3365afd3abda5f6a54f4a822dad5c9314e94c3fc Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:15 +0200 Subject: [PATCH 07/15] net: ipv6: ip6mr: Fix in/out netdev to pass to the FORWARD chain The netfilter hook is invoked with skb->dev for input netdevice, and vif_dev for output netdevice. However at the point of invocation, skb->dev is already set to vif_dev, and MR-forwarded packets are reported with in=out: # ip6tables -A FORWARD -j LOG --log-prefix '[forw]' # cd tools/testing/selftests/net/forwarding # ./router_multicast.sh # dmesg | fgrep '[forw]' [ 1670.248245] [forw]IN=v5 OUT=v5 [...] For reference, IPv4 MR code shows in and out as appropriate. Fix by caching skb->dev and using the updated value for output netdev. Fixes: 7bc570c8b4f7 ("[IPV6] MROUTE: Support multicast forwarding.") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/3141ae8386fbe13fef4b793faa75e6bae58d798a.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6mr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 9db31e5b998c..426859cd3409 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2039,6 +2039,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; + struct net_device *indev = skb->dev; struct net_device *vif_dev; struct ipv6hdr *ipv6h; struct dst_entry *dst; @@ -2101,7 +2102,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, IP6CB(skb)->flags |= IP6SKB_FORWARDED; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, - net, NULL, skb, skb->dev, vif_dev, + net, NULL, skb, indev, skb->dev, ip6mr_forward2_finish); out_free: From 094f39d5e84d6c48bb38ded6b30f68b12cb8145b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:16 +0200 Subject: [PATCH 08/15] net: ipv6: ip6mr: Make ip6mr_forward2() void Nobody uses the return value, so convert the function to void. Signed-off-by: Petr Machata Link: https://patch.msgid.link/e0bee259da0da58da96647ea9e21e6360c8f7e11.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6mr.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 426859cd3409..41c348209e1b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2035,8 +2035,8 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct * Processing handlers for ip6mr_forward */ -static int ip6mr_forward2(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, int vifi) +static void ip6mr_forward2(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *indev = skb->dev; @@ -2101,13 +2101,13 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, IP6CB(skb)->flags |= IP6SKB_FORWARDED; - return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, - net, NULL, skb, indev, skb->dev, - ip6mr_forward2_finish); + NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, + net, NULL, skb, indev, skb->dev, + ip6mr_forward2_finish); + return; out_free: kfree_skb(skb); - return 0; } /* Called with rcu_read_lock() */ From 1b02f4475d29c6e2c4b7f1bae74149b9c1369791 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:17 +0200 Subject: [PATCH 09/15] net: ipv6: ip6mr: Split ip6mr_forward2() in two Some of the work of ip6mr_forward2() is specific to IPMR forwarding, and should not take place on the output path. In order to allow reuse of the common parts, extract out of the function a helper, ip6mr_prepare_forward(). Signed-off-by: Petr Machata Link: https://patch.msgid.link/8932bd5c0fbe3f662b158803b8509604fa7bc113.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6mr.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 41c348209e1b..bd964564160d 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2035,11 +2035,10 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct * Processing handlers for ip6mr_forward */ -static void ip6mr_forward2(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, int vifi) +static int ip6mr_prepare_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; - struct net_device *indev = skb->dev; struct net_device *vif_dev; struct ipv6hdr *ipv6h; struct dst_entry *dst; @@ -2047,7 +2046,7 @@ static void ip6mr_forward2(struct net *net, struct mr_table *mrt, vif_dev = vif_dev_read(vif); if (!vif_dev) - goto out_free; + return -1; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { @@ -2056,7 +2055,7 @@ static void ip6mr_forward2(struct net *net, struct mr_table *mrt, DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); - goto out_free; + return -1; } #endif @@ -2070,7 +2069,7 @@ static void ip6mr_forward2(struct net *net, struct mr_table *mrt, dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); - goto out_free; + return -1; } skb_dst_drop(skb); @@ -2094,10 +2093,20 @@ static void ip6mr_forward2(struct net *net, struct mr_table *mrt, /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) - goto out_free; + return -1; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; + return 0; +} + +static void ip6mr_forward2(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) +{ + struct net_device *indev = skb->dev; + + if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; IP6CB(skb)->flags |= IP6SKB_FORWARDED; From 96e8f5a9fe2d91b9f9eb8b45cc13ce1ca6a8af82 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:18 +0200 Subject: [PATCH 10/15] net: ipv6: Add ip6_mr_output() Multicast routing is today handled in the input path. Locally generated MC packets don't hit the IPMR code today. Thus if a VXLAN remote address is multicast, the driver needs to set an OIF during route lookup. Thus MC routing configuration needs to be kept in sync with the VXLAN FDB and MDB. Ideally, the VXLAN packets would be routed by the MC routing code instead. To that end, this patch adds support to route locally generated multicast packets. The newly-added routines do largely what ip6_mr_input() and ip6_mr_forward() do: make an MR cache lookup to find where to send the packets, and use ip6_output() to send each of them. When no cache entry is found, the packet is punted to the daemon for resolution. Similarly to the IPv4 case in a previous patch, the new logic is contingent on a newly-added IP6CB flag being set. Signed-off-by: Petr Machata Link: https://patch.msgid.link/3bcc034a3ab4d3c291072fff38f78d7fbbeef4e6.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 1 + include/linux/mroute6.h | 7 +++ net/ipv6/ip6mr.c | 118 ++++++++++++++++++++++++++++++++++++++++ net/ipv6/route.c | 1 + 4 files changed, 127 insertions(+) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 5aeeed22f35b..db0eb0d86b64 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -156,6 +156,7 @@ struct inet6_skb_parm { #define IP6SKB_SEG6 256 #define IP6SKB_FAKEJUMBO 512 #define IP6SKB_MULTIPATH 1024 +#define IP6SKB_MCROUTE 2048 }; #if defined(CONFIG_NET_L3_MASTER_DEV) diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 63ef5191cc57..fddafdc168f7 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -31,6 +31,7 @@ extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mr_input(struct sk_buff *skb); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); extern int ip6_mr_init(void); +extern int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb); extern void ip6_mr_cleanup(void); int ip6mr_ioctl(struct sock *sk, int cmd, void *arg); #else @@ -58,6 +59,12 @@ static inline int ip6_mr_init(void) return 0; } +static inline int +ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + return ip6_output(net, sk, skb); +} + static inline void ip6_mr_cleanup(void) { return; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index bd964564160d..a35f4f1c6589 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2119,6 +2119,19 @@ static void ip6mr_forward2(struct net *net, struct mr_table *mrt, kfree_skb(skb); } +static void ip6mr_output2(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) +{ + if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; + + ip6_output(net, NULL, skb); + return; + +out_free: + kfree_skb(skb); +} + /* Called with rcu_read_lock() */ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { @@ -2231,6 +2244,56 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, kfree_skb(skb); } +/* Called under rcu_read_lock() */ +static void ip6_mr_output_finish(struct net *net, struct mr_table *mrt, + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *c) +{ + int psend = -1; + int ct; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); + + /* Forward the frame */ + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_any(&c->mf6c_mcastgrp)) { + if (ipv6_hdr(skb)->hop_limit > + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. + */ + psend = c->_c.mfc_parent; + goto last_forward; + } + goto dont_forward; + } + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { + if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ip6mr_output2(net, mrt, skb2, psend); + } + psend = ct; + } + } +last_forward: + if (psend != -1) { + ip6mr_output2(net, mrt, skb, psend); + return; + } + +dont_forward: + kfree_skb(skb); +} /* * Multicast packets for forwarding arrive here @@ -2298,6 +2361,61 @@ int ip6_mr_input(struct sk_buff *skb) return 0; } +int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + struct flowi6 fl6 = (struct flowi6) { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_mark = skb->mark, + }; + struct mfc6_cache *cache; + struct mr_table *mrt; + int err; + int vif; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (IP6CB(skb)->flags & IP6SKB_FORWARDED) + goto ip6_output; + if (!(IP6CB(skb)->flags & IP6SKB_MCROUTE)) + goto ip6_output; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + cache = ip6mr_cache_find(mrt, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); + if (!cache) { + vif = ip6mr_find_vif(mrt, dev); + if (vif >= 0) + cache = ip6mr_cache_find_any(mrt, + &ipv6_hdr(skb)->daddr, + vif); + } + + /* No usable cache entry */ + if (!cache) { + vif = ip6mr_find_vif(mrt, dev); + if (vif >= 0) + return ip6mr_cache_unresolved(mrt, vif, skb, dev); + goto ip6_output; + } + + /* Wrong interface */ + vif = cache->_c.mfc_parent; + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) + goto ip6_output; + + ip6_mr_output_finish(net, mrt, dev, skb, cache); + return 0; + +ip6_output: + return ip6_output(net, sk, skb); +} + int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 79c8f1acf8a3..df0caffefb38 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1145,6 +1145,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) rt->dst.input = ip6_input; } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; + rt->dst.output = ip6_mr_output; } else { rt->dst.input = ip6_forward; } From f8337efa4ff5a27e6c1d4e384166413eecd21a65 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:19 +0200 Subject: [PATCH 11/15] vxlan: Support MC routing in the underlay Locally-generated MC packets have so far not been subject to MC routing. Instead an MC-enabled installation would maintain the MC routing tables, and separately from that the list of interfaces to send packets to as part of the VXLAN FDB and MDB. In a previous patch, a ip_mr_output() and ip6_mr_output() routines were added for IPv4 and IPv6. All locally generated MC traffic is now passed through these functions. For reasons of backward compatibility, an SKB (IPCB / IP6CB) flag guards the actual MC routing. This patch adds logic to set the flag, and the UAPI to enable the behavior. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/d899655bb7e9b2521ee8c793e67056b9fd02ba12.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 22 ++++++++++++++++++++-- include/net/vxlan.h | 5 ++++- include/uapi/linux/if_link.h | 1 + 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index b22f9866be8e..a6cc1de4d8b8 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2451,6 +2451,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, rcu_read_lock(); if (addr_family == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); + u16 ipcb_flags = 0; struct rtable *rt; __be16 df = 0; __be32 saddr; @@ -2467,6 +2468,9 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto tx_error; } + if (flags & VXLAN_F_MC_ROUTE) + ipcb_flags |= IPSKB_MCROUTE; + if (!info) { /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, @@ -2522,11 +2526,13 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, pkey->u.ipv4.dst, tos, ttl, df, - src_port, dst_port, xnet, !udp_sum, 0); + src_port, dst_port, xnet, !udp_sum, + ipcb_flags); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct in6_addr saddr; + u16 ip6cb_flags = 0; if (!ifindex) ifindex = sock6->sock->sk->sk_bound_dev_if; @@ -2542,6 +2548,9 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto tx_error; } + if (flags & VXLAN_F_MC_ROUTE) + ip6cb_flags |= IP6SKB_MCROUTE; + if (!info) { u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags; @@ -2587,7 +2596,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, pkey->label, src_port, dst_port, !udp_sum, - 0); + ip6cb_flags); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); @@ -3402,6 +3411,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), [IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)), + [IFLA_VXLAN_MC_ROUTE] = NLA_POLICY_MAX(NLA_U8, 1), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4315,6 +4325,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], return err; } + if (data[IFLA_VXLAN_MC_ROUTE]) { + err = vxlan_nl2flag(conf, data, IFLA_VXLAN_MC_ROUTE, + VXLAN_F_MC_ROUTE, changelink, + true, extack); + if (err) + return err; + } + if (tb[IFLA_MTU]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], diff --git a/include/net/vxlan.h b/include/net/vxlan.h index e2f7ca045d3e..0ee50785f4f1 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -332,6 +332,7 @@ struct vxlan_dev { #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 +#define VXLAN_F_MC_ROUTE 0x100000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -353,7 +354,9 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ - VXLAN_F_LOCALBYPASS) + VXLAN_F_LOCALBYPASS | \ + VXLAN_F_MC_ROUTE | \ + 0) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3ad2d5d98034..873c285996fe 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1398,6 +1398,7 @@ enum { IFLA_VXLAN_LOCALBYPASS, IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ IFLA_VXLAN_RESERVED_BITS, + IFLA_VXLAN_MC_ROUTE, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) From 2a719b7bacc74be9f04147be01262b6289ad8dc5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:20 +0200 Subject: [PATCH 12/15] selftests: forwarding: lib: Move smcrouted helpers here router_multicast.sh has several helpers for work with smcrouted. Extract them to lib.sh so that other selftests can use them as well. Convert the helpers to defer in the process, because that simplifies the interface quite a bit. Therefore have router_multicast.sh invoke defer_scopes_cleanup() in its cleanup() function. Signed-off-by: Petr Machata Link: https://patch.msgid.link/410411c1a81225ce6e44542289b9c3ec21e5786c.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 33 +++++++++++++++++ .../net/forwarding/router_multicast.sh | 35 ++++--------------- 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 508f3c700d71..253847372062 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -37,6 +37,7 @@ declare -A NETIFS=( : "${TEAMD:=teamd}" : "${MCD:=smcrouted}" : "${MC_CLI:=smcroutectl}" +: "${MCD_TABLE_NAME:=selftests}" # Constants for netdevice bring-up: # Default time in seconds to wait for an interface to come up before giving up @@ -1757,6 +1758,38 @@ mc_send() msend -g $groups -I $if_name -c 1 > /dev/null 2>&1 } +adf_mcd_start() +{ + local table_name="$MCD_TABLE_NAME" + local smcroutedir + local pid + local i + + check_command "$MCD" || return 1 + check_command "$MC_CLI" || return 1 + + smcroutedir=$(mktemp -d) + defer rm -rf "$smcroutedir" + + for ((i = 1; i <= NUM_NETIFS; ++i)); do + echo "phyint ${NETIFS[p$i]} enable" >> \ + "$smcroutedir/$table_name.conf" + done + + "$MCD" -N -I "$table_name" -f "$smcroutedir/$table_name.conf" \ + -P "$smcroutedir/$table_name.pid" + busywait "$BUSYWAIT_TIMEOUT" test -e "$smcroutedir/$table_name.pid" + pid=$(cat "$smcroutedir/$table_name.pid") + defer kill_process "$pid" +} + +mc_cli() +{ + local table_name="$MCD_TABLE_NAME" + + "$MC_CLI" -I "$table_name" "$@" +} + start_ip_monitor() { local mtype=$1; shift diff --git a/tools/testing/selftests/net/forwarding/router_multicast.sh b/tools/testing/selftests/net/forwarding/router_multicast.sh index 5a58b1ec8aef..83e52abdbc2e 100755 --- a/tools/testing/selftests/net/forwarding/router_multicast.sh +++ b/tools/testing/selftests/net/forwarding/router_multicast.sh @@ -33,10 +33,6 @@ NUM_NETIFS=6 source lib.sh source tc_common.sh -require_command $MCD -require_command $MC_CLI -table_name=selftests - h1_create() { simple_if_init $h1 198.51.100.2/28 2001:db8:1::2/64 @@ -149,25 +145,6 @@ router_destroy() ip link set dev $rp1 down } -start_mcd() -{ - SMCROUTEDIR="$(mktemp -d)" - - for ((i = 1; i <= $NUM_NETIFS; ++i)); do - echo "phyint ${NETIFS[p$i]} enable" >> \ - $SMCROUTEDIR/$table_name.conf - done - - $MCD -N -I $table_name -f $SMCROUTEDIR/$table_name.conf \ - -P $SMCROUTEDIR/$table_name.pid -} - -kill_mcd() -{ - pkill $MCD - rm -rf $SMCROUTEDIR -} - setup_prepare() { h1=${NETIFS[p1]} @@ -179,7 +156,7 @@ setup_prepare() rp3=${NETIFS[p5]} h3=${NETIFS[p6]} - start_mcd + adf_mcd_start || exit "$EXIT_STATUS" vrf_prepare @@ -206,7 +183,7 @@ cleanup() vrf_cleanup - kill_mcd + defer_scopes_cleanup } create_mcast_sg() @@ -214,9 +191,9 @@ create_mcast_sg() local if_name=$1; shift local s_addr=$1; shift local mcast=$1; shift - local dest_ifs=${@} + local dest_ifs=("${@}") - $MC_CLI -I $table_name add $if_name $s_addr $mcast $dest_ifs + mc_cli add "$if_name" "$s_addr" "$mcast" "${dest_ifs[@]}" } delete_mcast_sg() @@ -224,9 +201,9 @@ delete_mcast_sg() local if_name=$1; shift local s_addr=$1; shift local mcast=$1; shift - local dest_ifs=${@} + local dest_ifs=("${@}") - $MC_CLI -I $table_name remove $if_name $s_addr $mcast $dest_ifs + mc_cli remove "$if_name" "$s_addr" "$mcast" "${dest_ifs[@]}" } mcast_v4() From 4baa1d3a5080c18a079d3688fa9726973959ee20 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:21 +0200 Subject: [PATCH 13/15] selftests: net: lib: Add ip_link_has_flag() Add a helper to determine whether a given netdevice has a given flag. Rewrite ip_link_is_up() in terms of the new helper. Signed-off-by: Petr Machata Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/e1eb174a411f9d24735d095984c731d1d4a5a592.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 006fdadcc4b9..ff0dbe23e8e0 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -547,13 +547,19 @@ ip_link_set_addr() defer ip link set dev "$name" address "$old_addr" } -ip_link_is_up() +ip_link_has_flag() { local name=$1; shift + local flag=$1; shift local state=$(ip -j link show "$name" | - jq -r '(.[].flags[] | select(. == "UP")) // "DOWN"') - [[ $state == "UP" ]] + jq --arg flag "$flag" 'any(.[].flags.[]; . == $flag)') + [[ $state == true ]] +} + +ip_link_is_up() +{ + ip_link_has_flag "$1" UP } ip_link_set_up() From 237f84a6d24a413b3a0795079afeb903ab1dec8a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:22 +0200 Subject: [PATCH 14/15] selftests: forwarding: adf_mcd_start(): Allow configuring custom interfaces Tests may wish to add other interfaces to listen on. Notably locally generated traffic uses dummy interfaces. The multicast daemon needs to know about these so that it allows forming rules that involve these interfaces, and so that net.ipv4.conf.X.mc_forwarding is set for the interfaces. To that end, allow passing in a list of interfaces to configure in addition to all the physical ones. Signed-off-by: Petr Machata Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/2e8d83297985933be4850f2b9f296b3c27110388.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 253847372062..83ee6a07e072 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1760,9 +1760,12 @@ mc_send() adf_mcd_start() { + local ifs=("$@") + local table_name="$MCD_TABLE_NAME" local smcroutedir local pid + local if local i check_command "$MCD" || return 1 @@ -1776,6 +1779,16 @@ adf_mcd_start() "$smcroutedir/$table_name.conf" done + for if in "${ifs[@]}"; do + if ! ip_link_has_flag "$if" MULTICAST; then + ip link set dev "$if" multicast on + defer ip link set dev "$if" multicast off + fi + + echo "phyint $if enable" >> \ + "$smcroutedir/$table_name.conf" + done + "$MCD" -N -I "$table_name" -f "$smcroutedir/$table_name.conf" \ -P "$smcroutedir/$table_name.pid" busywait "$BUSYWAIT_TIMEOUT" test -e "$smcroutedir/$table_name.pid" From e3180379e2df535ac492c24ecc9719bf83b7a0f9 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:23 +0200 Subject: [PATCH 15/15] selftests: forwarding: Add a test for verifying VXLAN MC underlay Add tests for MC-routing underlay VXLAN traffic. Signed-off-by: Petr Machata Link: https://patch.msgid.link/eecd2c0fefc754182e74be8e8e65751bf5749c21.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/forwarding/Makefile | 1 + .../net/forwarding/vxlan_bridge_1q_mc_ul.sh | 771 ++++++++++++++++++ 2 files changed, 772 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 00bde7b6f39e..d7bb2e80e88c 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -102,6 +102,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_bridge_1d_port_8472.sh \ vxlan_bridge_1d.sh \ vxlan_bridge_1q_ipv6.sh \ + vxlan_bridge_1q_mc_ul.sh \ vxlan_bridge_1q_port_8472_ipv6.sh \ vxlan_bridge_1q_port_8472.sh \ vxlan_bridge_1q.sh \ diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh new file mode 100755 index 000000000000..7ec58b6b1128 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh @@ -0,0 +1,771 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +-----------------------------------------+ +# | + $h1.10 + $h1.20 | +# | | 192.0.2.1/28 | 2001:db8:1::1/64 | +# | \________ ________/ | +# | \ / | +# | + $h1 H1 (vrf) | +# +-----------|-----------------------------+ +# | +# +-----------|----------------------------------------------------------------+ +# | +---------|--------------------------------------+ SWITCH (main vrf) | +# | | + $swp1 BR1 (802.1q) | | +# | | vid 10 20 | | +# | | | | +# | | + vx10 (vxlan) + vx20 (vxlan) | + lo10 (dummy) | +# | | local 192.0.2.100 local 2001:db8:4::1 | 192.0.2.100/28 | +# | | group 233.252.0.1 group ff0e::1:2:3 | 2001:db8:4::1/64 | +# | | id 1000 id 2000 | | +# | | vid 10 pvid untagged vid 20 pvid untagged | | +# | +------------------------------------------------+ | +# | | +# | + $swp2 $swp3 + | +# | | 192.0.2.33/28 192.0.2.65/28 | | +# | | 2001:db8:2::1/64 2001:db8:3::1/64 | | +# | | | | +# +---|--------------------------------------------------------------------|---+ +# | | +# +---|--------------------------------+ +--------------------------------|---+ +# | | H2 (vrf) | | H3 (vrf) | | +# | +-|----------------------------+ | | +-----------------------------|-+ | +# | | + $h2 BR2 (802.1d) | | | | BR3 (802.1d) $h3 + | | +# | | | | | | | | +# | | + v1$h2 (veth) | | | | v1$h3 (veth) + | | +# | +-|----------------------------+ | | +-----------------------------|-+ | +# | | | | | | +# +---|--------------------------------+ +--------------------------------|---+ +# | | +# +---|--------------------------------+ +--------------------------------|---+ +# | + v2$h2 (veth) NS2 (netns) | | NS3 (netns) v2$h3 (veth) + | +# | 192.0.2.34/28 | | 192.0.2.66/28 | +# | 2001:db8:2::2/64 | | 2001:db8:3::2/64 | +# | | | | +# | +--------------------------------+ | | +--------------------------------+ | +# | | BR1 (802.1q) | | | | BR1 (802.1q) | | +# | | + vx10 (vxlan) | | | | + vx10 (vxlan) | | +# | | local 192.0.2.34 | | | | local 192.0.2.50 | | +# | | group 233.252.0.1 dev v2$h2 | | | | group 233.252.0.1 dev v2$h3 | | +# | | id 1000 dstport $VXPORT | | | | id 1000 dstport $VXPORT | | +# | | vid 10 pvid untagged | | | | vid 10 pvid untagged | | +# | | | | | | | | +# | | + vx20 (vxlan) | | | | + vx20 (vxlan) | | +# | | local 2001:db8:2::2 | | | | local 2001:db8:3::2 | | +# | | group ff0e::1:2:3 dev v2$h2 | | | | group ff0e::1:2:3 dev v2$h3 | | +# | | id 2000 dstport $VXPORT | | | | id 2000 dstport $VXPORT | | +# | | vid 20 pvid untagged | | | | vid 20 pvid untagged | | +# | | | | | | | | +# | | + w1 (veth) | | | | + w1 (veth) | | +# | | | vid 10 20 | | | | | vid 10 20 | | +# | +--|-----------------------------+ | | +--|-----------------------------+ | +# | | | | | | +# | +--|-----------------------------+ | | +--|-----------------------------+ | +# | | + w2 (veth) VW2 (vrf) | | | | + w2 (veth) VW2 (vrf) | | +# | | |\ | | | | |\ | | +# | | | + w2.10 | | | | | + w2.10 | | +# | | | 192.0.2.3/28 | | | | | 192.0.2.4/28 | | +# | | | | | | | | | | +# | | + w2.20 | | | | + w2.20 | | +# | | 2001:db8:1::3/64 | | | | 2001:db8:1::4/64 | | +# | +--------------------------------+ | | +--------------------------------+ | +# +------------------------------------+ +------------------------------------+ +# +#shellcheck disable=SC2317 # SC doesn't see our uses of functions. + +: "${VXPORT:=4789}" +export VXPORT + +: "${GROUP4:=233.252.0.1}" +export GROUP4 + +: "${GROUP6:=ff0e::1:2:3}" +export GROUP6 + +: "${IPMR:=lo10}" + +ALL_TESTS=" + ipv4_nomcroute + ipv4_mcroute + ipv4_mcroute_changelink + ipv4_mcroute_starg + ipv4_mcroute_noroute + ipv4_mcroute_fdb + ipv4_mcroute_fdb_oif0 + ipv4_mcroute_fdb_oif0_sep + + ipv6_nomcroute + ipv6_mcroute + ipv6_mcroute_changelink + ipv6_mcroute_starg + ipv6_mcroute_noroute + ipv6_mcroute_fdb + ipv6_mcroute_fdb_oif0 + + ipv4_nomcroute_rx + ipv4_mcroute_rx + ipv4_mcroute_starg_rx + ipv4_mcroute_fdb_oif0_sep_rx + ipv4_mcroute_fdb_sep_rx + + ipv6_nomcroute_rx + ipv6_mcroute_rx + ipv6_mcroute_starg_rx + ipv6_mcroute_fdb_sep_rx +" + +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init "$h1" + defer simple_if_fini "$h1" + + ip_link_add "$h1.10" master "v$h1" link "$h1" type vlan id 10 + ip_link_set_up "$h1.10" + ip_addr_add "$h1.10" 192.0.2.1/28 + + ip_link_add "$h1.20" master "v$h1" link "$h1" type vlan id 20 + ip_link_set_up "$h1.20" + ip_addr_add "$h1.20" 2001:db8:1::1/64 +} + +install_capture() +{ + local dev=$1; shift + + tc qdisc add dev "$dev" clsact + defer tc qdisc del dev "$dev" clsact + + tc filter add dev "$dev" ingress proto ip pref 104 \ + flower skip_hw ip_proto udp dst_port "$VXPORT" \ + action pass + defer tc filter del dev "$dev" ingress proto ip pref 104 + + tc filter add dev "$dev" ingress proto ipv6 pref 106 \ + flower skip_hw ip_proto udp dst_port "$VXPORT" \ + action pass + defer tc filter del dev "$dev" ingress proto ipv6 pref 106 +} + +h2_create() +{ + # $h2 + ip_link_set_up "$h2" + + # H2 + vrf_create "v$h2" + defer vrf_destroy "v$h2" + + ip_link_set_up "v$h2" + + # br2 + ip_link_add br2 type bridge vlan_filtering 0 mcast_snooping 0 + ip_link_set_master br2 "v$h2" + ip_link_set_up br2 + + # $h2 + ip_link_set_master "$h2" br2 + install_capture "$h2" + + # v1$h2 + ip_link_set_up "v1$h2" + ip_link_set_master "v1$h2" br2 +} + +h3_create() +{ + # $h3 + ip_link_set_up "$h3" + + # H3 + vrf_create "v$h3" + defer vrf_destroy "v$h3" + + ip_link_set_up "v$h3" + + # br3 + ip_link_add br3 type bridge vlan_filtering 0 mcast_snooping 0 + ip_link_set_master br3 "v$h3" + ip_link_set_up br3 + + # $h3 + ip_link_set_master "$h3" br3 + install_capture "$h3" + + # v1$h3 + ip_link_set_up "v1$h3" + ip_link_set_master "v1$h3" br3 +} + +switch_create() +{ + local swp1_mac + + # br1 + swp1_mac=$(mac_get "$swp1") + ip_link_add br1 type bridge vlan_filtering 1 \ + vlan_default_pvid 0 mcast_snooping 0 + ip_link_set_addr br1 "$swp1_mac" + ip_link_set_up br1 + + # A dummy to force the IPv6 OIF=0 test to install a suitable MC route on + # $IPMR to be deterministic. Also used for the IPv6 RX!=TX ping test. + ip_link_add "X$IPMR" up type dummy + + # IPMR + ip_link_add "$IPMR" up type dummy + ip_addr_add "$IPMR" 192.0.2.100/28 + ip_addr_add "$IPMR" 2001:db8:4::1/64 + + # $swp1 + ip_link_set_up "$swp1" + ip_link_set_master "$swp1" br1 + bridge_vlan_add vid 10 dev "$swp1" + bridge_vlan_add vid 20 dev "$swp1" + + # $swp2 + ip_link_set_up "$swp2" + ip_addr_add "$swp2" 192.0.2.33/28 + ip_addr_add "$swp2" 2001:db8:2::1/64 + + # $swp3 + ip_link_set_up "$swp3" + ip_addr_add "$swp3" 192.0.2.65/28 + ip_addr_add "$swp3" 2001:db8:3::1/64 +} + +vx_create() +{ + local name=$1; shift + local vid=$1; shift + + ip_link_add "$name" up type vxlan dstport "$VXPORT" \ + nolearning noudpcsum tos inherit ttl 16 \ + "$@" + ip_link_set_master "$name" br1 + bridge_vlan_add vid "$vid" dev "$name" pvid untagged +} +export -f vx_create + +vx_wait() +{ + # Wait for all the ARP, IGMP etc. noise to settle down so that the + # tunnel is clear for measurements. + sleep 10 +} + +vx10_create() +{ + vx_create vx10 10 id 1000 "$@" +} +export -f vx10_create + +vx20_create() +{ + vx_create vx20 20 id 2000 "$@" +} +export -f vx20_create + +vx10_create_wait() +{ + vx10_create "$@" + vx_wait +} + +vx20_create_wait() +{ + vx20_create "$@" + vx_wait +} + +ns_init_common() +{ + local ns=$1; shift + local if_in=$1; shift + local ipv4_in=$1; shift + local ipv6_in=$1; shift + local ipv4_host=$1; shift + local ipv6_host=$1; shift + + # v2$h2 / v2$h3 + ip_link_set_up "$if_in" + ip_addr_add "$if_in" "$ipv4_in" + ip_addr_add "$if_in" "$ipv6_in" + + # br1 + ip_link_add br1 type bridge vlan_filtering 1 \ + vlan_default_pvid 0 mcast_snooping 0 + ip_link_set_up br1 + + # vx10, vx20 + vx10_create local "${ipv4_in%/*}" group "$GROUP4" dev "$if_in" + vx20_create local "${ipv6_in%/*}" group "$GROUP6" dev "$if_in" + + # w1 + ip_link_add w1 type veth peer name w2 + ip_link_set_master w1 br1 + ip_link_set_up w1 + bridge_vlan_add vid 10 dev w1 + bridge_vlan_add vid 20 dev w1 + + # w2 + simple_if_init w2 + defer simple_if_fini w2 + + # w2.10 + ip_link_add w2.10 master vw2 link w2 type vlan id 10 + ip_link_set_up w2.10 + ip_addr_add w2.10 "$ipv4_host" + + # w2.20 + ip_link_add w2.20 master vw2 link w2 type vlan id 20 + ip_link_set_up w2.20 + ip_addr_add w2.20 "$ipv6_host" +} +export -f ns_init_common + +ns2_create() +{ + # NS2 + ip netns add ns2 + defer ip netns del ns2 + + # v2$h2 + ip link set dev "v2$h2" netns ns2 + defer ip -n ns2 link set dev "v2$h2" netns 1 + + in_ns ns2 \ + ns_init_common ns2 "v2$h2" \ + 192.0.2.34/28 2001:db8:2::2/64 \ + 192.0.2.3/28 2001:db8:1::3/64 +} + +ns3_create() +{ + # NS3 + ip netns add ns3 + defer ip netns del ns3 + + # v2$h3 + ip link set dev "v2$h3" netns ns3 + defer ip -n ns3 link set dev "v2$h3" netns 1 + + ip -n ns3 link set dev "v2$h3" up + + in_ns ns3 \ + ns_init_common ns3 "v2$h3" \ + 192.0.2.66/28 2001:db8:3::2/64 \ + 192.0.2.4/28 2001:db8:1::4/64 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + vrf_prepare + defer vrf_cleanup + + forwarding_enable + defer forwarding_restore + + ip_link_add "v1$h2" type veth peer name "v2$h2" + ip_link_add "v1$h3" type veth peer name "v2$h3" + + h1_create + h2_create + h3_create + switch_create + ns2_create + ns3_create +} + +adf_install_broken_sg() +{ + adf_mcd_start "$IPMR" || exit "$EXIT_STATUS" + + mc_cli add "$swp2" 192.0.2.100 "$GROUP4" "$swp1" "$swp3" + defer mc_cli remove "$swp2" 192.0.2.100 "$GROUP4" "$swp1" "$swp3" + + mc_cli add "$swp2" 2001:db8:4::1 "$GROUP6" "$swp1" "$swp3" + defer mc_cli remove "$swp2" 2001:db8:4::1 "$GROUP6" "$swp1" "$swp3" +} + +adf_install_rx() +{ + mc_cli add "$swp2" 0.0.0.0 "$GROUP4" "$IPMR" + defer mc_cli remove "$swp2" 0.0.0.0 "$GROUP4" lo10 + + mc_cli add "$swp3" 0.0.0.0 "$GROUP4" "$IPMR" + defer mc_cli remove "$swp3" 0.0.0.0 "$GROUP4" lo10 + + mc_cli add "$swp2" :: "$GROUP6" "$IPMR" + defer mc_cli remove "$swp2" :: "$GROUP6" lo10 + + mc_cli add "$swp3" :: "$GROUP6" "$IPMR" + defer mc_cli remove "$swp3" :: "$GROUP6" lo10 +} + +adf_install_sg() +{ + adf_mcd_start "$IPMR" || exit "$EXIT_STATUS" + + mc_cli add "$IPMR" 192.0.2.100 "$GROUP4" "$swp2" "$swp3" + defer mc_cli remove "$IPMR" 192.0.2.33 "$GROUP4" "$swp2" "$swp3" + + mc_cli add "$IPMR" 2001:db8:4::1 "$GROUP6" "$swp2" "$swp3" + defer mc_cli remove "$IPMR" 2001:db8:4::1 "$GROUP6" "$swp2" "$swp3" + + adf_install_rx +} + +adf_install_sg_sep() +{ + adf_mcd_start lo || exit "$EXIT_STATUS" + + mc_cli add lo 192.0.2.120 "$GROUP4" "$swp2" "$swp3" + defer mc_cli remove lo 192.0.2.120 "$GROUP4" "$swp2" "$swp3" + + mc_cli add lo 2001:db8:5::1 "$GROUP6" "$swp2" "$swp3" + defer mc_cli remove lo 2001:db8:5::1 "$GROUP6" "$swp2" "$swp3" +} + +adf_install_sg_sep_rx() +{ + local lo=$1; shift + + adf_mcd_start "$IPMR" "$lo" || exit "$EXIT_STATUS" + + mc_cli add "$lo" 192.0.2.120 "$GROUP4" "$swp2" "$swp3" + defer mc_cli remove "$lo" 192.0.2.120 "$GROUP4" "$swp2" "$swp3" + + mc_cli add "$lo" 2001:db8:5::1 "$GROUP6" "$swp2" "$swp3" + defer mc_cli remove "$lo" 2001:db8:5::1 "$GROUP6" "$swp2" "$swp3" + + adf_install_rx +} + +adf_install_starg() +{ + adf_mcd_start "$IPMR" || exit "$EXIT_STATUS" + + mc_cli add "$IPMR" 0.0.0.0 "$GROUP4" "$swp2" "$swp3" + defer mc_cli remove "$IPMR" 0.0.0.0 "$GROUP4" "$swp2" "$swp3" + + mc_cli add "$IPMR" :: "$GROUP6" "$swp2" "$swp3" + defer mc_cli remove "$IPMR" :: "$GROUP6" "$swp2" "$swp3" + + adf_install_rx +} + +do_packets_v4() +{ + local mac + + mac=$(mac_get "$h2") + "$MZ" "$h1" -Q 10 -c 10 -d 100msec -p 64 -a own -b "$mac" \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=1234,dp=2345 -q +} + +do_packets_v6() +{ + local mac + + mac=$(mac_get "$h2") + "$MZ" -6 "$h1" -Q 20 -c 10 -d 100msec -p 64 -a own -b "$mac" \ + -A 2001:db8:1::1 -B 2001:db8:1::2 -t udp sp=1234,dp=2345 -q +} + +do_test() +{ + local ipv=$1; shift + local expect_h2=$1; shift + local expect_h3=$1; shift + local what=$1; shift + + local pref=$((100 + ipv)) + local t0_h2 + local t0_h3 + local t1_h2 + local t1_h3 + local d_h2 + local d_h3 + + RET=0 + + t0_h2=$(tc_rule_stats_get "$h2" "$pref" ingress) + t0_h3=$(tc_rule_stats_get "$h3" "$pref" ingress) + + "do_packets_v$ipv" + sleep 1 + + t1_h2=$(tc_rule_stats_get "$h2" "$pref" ingress) + t1_h3=$(tc_rule_stats_get "$h3" "$pref" ingress) + + d_h2=$((t1_h2 - t0_h2)) + d_h3=$((t1_h3 - t0_h3)) + + ((d_h2 == expect_h2)) + check_err $? "Expected $expect_h2 packets on H2, got $d_h2" + + ((d_h3 == expect_h3)) + check_err $? "Expected $expect_h3 packets on H3, got $d_h3" + + log_test "VXLAN MC flood $what" +} + +ipv4_do_test_rx() +{ + local h3_should_fail=$1; shift + local what=$1; shift + + RET=0 + + ping_do "$h1.10" 192.0.2.3 + check_err $? "H2 should respond" + + ping_do "$h1.10" 192.0.2.4 + check_err_fail "$h3_should_fail" $? "H3 responds" + + log_test "VXLAN MC flood $what" +} + +ipv6_do_test_rx() +{ + local h3_should_fail=$1; shift + local what=$1; shift + + RET=0 + + ping6_do "$h1.20" 2001:db8:1::3 + check_err $? "H2 should respond" + + ping6_do "$h1.20" 2001:db8:1::4 + check_err_fail "$h3_should_fail" $? "H3 responds" + + log_test "VXLAN MC flood $what" +} + +ipv4_nomcroute() +{ + # Install a misleading (S,G) rule to attempt to trick the system into + # pushing the packets elsewhere. + adf_install_broken_sg + vx10_create_wait local 192.0.2.100 group "$GROUP4" dev "$swp2" + do_test 4 10 0 "IPv4 nomcroute" +} + +ipv6_nomcroute() +{ + # Like for IPv4, install a misleading (S,G). + adf_install_broken_sg + vx20_create_wait local 2001:db8:4::1 group "$GROUP6" dev "$swp2" + do_test 6 10 0 "IPv6 nomcroute" +} + +ipv4_nomcroute_rx() +{ + vx10_create local 192.0.2.100 group "$GROUP4" dev "$swp2" + ipv4_do_test_rx 1 "IPv4 nomcroute ping" +} + +ipv6_nomcroute_rx() +{ + vx20_create local 2001:db8:4::1 group "$GROUP6" dev "$swp2" + ipv6_do_test_rx 1 "IPv6 nomcroute ping" +} + +ipv4_mcroute() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 group "$GROUP4" dev "$IPMR" mcroute + do_test 4 10 10 "IPv4 mcroute" +} + +ipv6_mcroute() +{ + adf_install_sg + vx20_create_wait local 2001:db8:4::1 group "$GROUP6" dev "$IPMR" mcroute + do_test 6 10 10 "IPv6 mcroute" +} + +ipv4_mcroute_rx() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 group "$GROUP4" dev "$IPMR" mcroute + ipv4_do_test_rx 0 "IPv4 mcroute ping" +} + +ipv6_mcroute_rx() +{ + adf_install_sg + vx20_create_wait local 2001:db8:4::1 group "$GROUP6" dev "$IPMR" mcroute + ipv6_do_test_rx 0 "IPv6 mcroute ping" +} + +ipv4_mcroute_changelink() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 group "$GROUP4" dev "$IPMR" + ip link set dev vx10 type vxlan mcroute + sleep 1 + do_test 4 10 10 "IPv4 mcroute changelink" +} + +ipv6_mcroute_changelink() +{ + adf_install_sg + vx20_create_wait local 2001:db8:4::1 group "$GROUP6" dev "$IPMR" mcroute + ip link set dev vx20 type vxlan mcroute + sleep 1 + do_test 6 10 10 "IPv6 mcroute changelink" +} + +ipv4_mcroute_starg() +{ + adf_install_starg + vx10_create_wait local 192.0.2.100 group "$GROUP4" dev "$IPMR" mcroute + do_test 4 10 10 "IPv4 mcroute (*,G)" +} + +ipv6_mcroute_starg() +{ + adf_install_starg + vx20_create_wait local 2001:db8:4::1 group "$GROUP6" dev "$IPMR" mcroute + do_test 6 10 10 "IPv6 mcroute (*,G)" +} + +ipv4_mcroute_starg_rx() +{ + adf_install_starg + vx10_create_wait local 192.0.2.100 group "$GROUP4" dev "$IPMR" mcroute + ipv4_do_test_rx 0 "IPv4 mcroute (*,G) ping" +} + +ipv6_mcroute_starg_rx() +{ + adf_install_starg + vx20_create_wait local 2001:db8:4::1 group "$GROUP6" dev "$IPMR" mcroute + ipv6_do_test_rx 0 "IPv6 mcroute (*,G) ping" +} + +ipv4_mcroute_noroute() +{ + vx10_create_wait local 192.0.2.100 group "$GROUP4" dev "$IPMR" mcroute + do_test 4 0 0 "IPv4 mcroute, no route" +} + +ipv6_mcroute_noroute() +{ + vx20_create_wait local 2001:db8:4::1 group "$GROUP6" dev "$IPMR" mcroute + do_test 6 0 0 "IPv6 mcroute, no route" +} + +ipv4_mcroute_fdb() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 dev "$IPMR" mcroute + bridge fdb add dev vx10 \ + 00:00:00:00:00:00 self static dst "$GROUP4" via "$IPMR" + do_test 4 10 10 "IPv4 mcroute FDB" +} + +ipv6_mcroute_fdb() +{ + adf_install_sg + vx20_create_wait local 2001:db8:4::1 dev "$IPMR" mcroute + bridge -6 fdb add dev vx20 \ + 00:00:00:00:00:00 self static dst "$GROUP6" via "$IPMR" + do_test 6 10 10 "IPv6 mcroute FDB" +} + +# Use FDB to configure VXLAN in a way where oif=0 for purposes of FIB lookup. +ipv4_mcroute_fdb_oif0() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 group "$GROUP4" dev "$IPMR" mcroute + bridge fdb del dev vx10 00:00:00:00:00:00 + bridge fdb add dev vx10 00:00:00:00:00:00 self static dst "$GROUP4" + do_test 4 10 10 "IPv4 mcroute oif=0" +} + +ipv6_mcroute_fdb_oif0() +{ + # The IPv6 tunnel lookup does not fall back to selection by source + # address. Instead it just does a FIB match, and that would find one of + # the several ff00::/8 multicast routes -- each device has one. In order + # to reliably force the $IPMR device, add a /128 route for the + # destination group address. + ip -6 route add table local multicast "$GROUP6/128" dev "$IPMR" + defer ip -6 route del table local multicast "$GROUP6/128" dev "$IPMR" + + adf_install_sg + vx20_create_wait local 2001:db8:4::1 group "$GROUP6" dev "$IPMR" mcroute + bridge -6 fdb del dev vx20 00:00:00:00:00:00 + bridge -6 fdb add dev vx20 00:00:00:00:00:00 self static dst "$GROUP6" + do_test 6 10 10 "IPv6 mcroute oif=0" +} + +# In oif=0 test as above, have FIB lookup resolve to loopback instead of IPMR. +# This doesn't work with IPv6 -- a MC route on lo would be marked as RTF_REJECT. +ipv4_mcroute_fdb_oif0_sep() +{ + adf_install_sg_sep + + ip_addr_add lo 192.0.2.120/28 + vx10_create_wait local 192.0.2.120 group "$GROUP4" dev "$IPMR" mcroute + bridge fdb del dev vx10 00:00:00:00:00:00 + bridge fdb add dev vx10 00:00:00:00:00:00 self static dst "$GROUP4" + do_test 4 10 10 "IPv4 mcroute TX!=RX oif=0" +} + +ipv4_mcroute_fdb_oif0_sep_rx() +{ + adf_install_sg_sep_rx lo + + ip_addr_add lo 192.0.2.120/28 + vx10_create_wait local 192.0.2.120 group "$GROUP4" dev "$IPMR" mcroute + bridge fdb del dev vx10 00:00:00:00:00:00 + bridge fdb add dev vx10 00:00:00:00:00:00 self static dst "$GROUP4" + ipv4_do_test_rx 0 "IPv4 mcroute TX!=RX oif=0 ping" +} + +ipv4_mcroute_fdb_sep_rx() +{ + adf_install_sg_sep_rx lo + + ip_addr_add lo 192.0.2.120/28 + vx10_create_wait local 192.0.2.120 group "$GROUP4" dev "$IPMR" mcroute + bridge fdb del dev vx10 00:00:00:00:00:00 + bridge fdb add \ + dev vx10 00:00:00:00:00:00 self static dst "$GROUP4" via lo + ipv4_do_test_rx 0 "IPv4 mcroute TX!=RX ping" +} + +ipv6_mcroute_fdb_sep_rx() +{ + adf_install_sg_sep_rx "X$IPMR" + + ip_addr_add "X$IPMR" 2001:db8:5::1/64 + vx20_create_wait local 2001:db8:5::1 group "$GROUP6" dev "$IPMR" mcroute + bridge -6 fdb del dev vx20 00:00:00:00:00:00 + bridge -6 fdb add dev vx20 00:00:00:00:00:00 \ + self static dst "$GROUP6" via "X$IPMR" + ipv6_do_test_rx 0 "IPv6 mcroute TX!=RX ping" +} + +trap cleanup EXIT + +setup_prepare +setup_wait "$NUM_NETIFS" +tests_run + +exit "$EXIT_STATUS"