Merge branch 'ipv4-icmp-fix-source-ip-derivation-in-presence-of-vrfs'

Ido Schimmel says:

====================
ipv4: icmp: Fix source IP derivation in presence of VRFs

Align IPv4 with IPv6 and in the presence of VRFs generate ICMP error
messages with a source IP that is derived from the receiving interface
and not from its VRF master. This is especially important when the error
messages are "Time Exceeded" messages as it means that utilities like
traceroute will show an incorrect packet path.

Patches #1-#2 are preparations.

Patch #3 is the actual change.

Patches #4-#7 make small improvements in the existing traceroute test.

Patch #8 extends the traceroute test with VRF test cases for both IPv4
and IPv6.

Changes since v1 [1]:
* Rebase.

[1] https://lore.kernel.org/netdev/20250901083027.183468-1-idosch@nvidia.com/
====================

Link: https://patch.msgid.link/20250908073238.119240-1-idosch@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
Paolo Abeni
2025-09-11 12:22:40 +02:00
5 changed files with 229 additions and 69 deletions

View File

@@ -37,10 +37,10 @@ struct sk_buff;
struct net;
void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
const struct ip_options *opt);
const struct inet_skb_parm *parm);
static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
__icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt);
__icmp_send(skb_in, type, code, info, IPCB(skb_in));
}
#if IS_ENABLED(CONFIG_NF_NAT)
@@ -48,8 +48,10 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info);
#else
static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct ip_options opts = { 0 };
__icmp_send(skb_in, type, code, info, &opts);
struct inet_skb_parm parm;
memset(&parm, 0, sizeof(parm));
__icmp_send(skb_in, type, code, info, &parm);
}
#endif

View File

@@ -1715,8 +1715,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
*/
void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
{
unsigned char optbuf[sizeof(struct ip_options) + 40];
struct ip_options *opt = (struct ip_options *)optbuf;
struct inet_skb_parm parm;
int res;
if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
@@ -1727,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
* so we can not use icmp_send and IPCB here.
*/
memset(opt, 0, sizeof(struct ip_options));
opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);
memset(&parm, 0, sizeof(parm));
parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL);
res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL);
rcu_read_unlock();
if (res)
return;
if (gateway)
__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt);
__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm);
else
__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt);
__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm);
}
/**

View File

@@ -594,7 +594,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
*/
void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
const struct ip_options *opt)
const struct inet_skb_parm *parm)
{
struct iphdr *iph;
int room;
@@ -710,7 +710,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
rcu_read_lock();
if (rt_is_input_route(rt) &&
READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr))
dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif :
inet_iif(skb_in));
if (dev)
saddr = inet_select_addr(dev, iph->saddr,
@@ -725,7 +726,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in,
&parm->opt))
goto out_unlock;
@@ -799,15 +801,16 @@ EXPORT_SYMBOL(__icmp_send);
void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct sk_buff *cloned_skb = NULL;
struct ip_options opts = { 0 };
enum ip_conntrack_info ctinfo;
enum ip_conntrack_dir dir;
struct inet_skb_parm parm;
struct nf_conn *ct;
__be32 orig_ip;
memset(&parm, 0, sizeof(parm));
ct = nf_ct_get(skb_in, &ctinfo);
if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
__icmp_send(skb_in, type, code, info, &opts);
__icmp_send(skb_in, type, code, info, &parm);
return;
}
@@ -823,7 +826,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
orig_ip = ip_hdr(skb_in)->saddr;
dir = CTINFO2DIR(ctinfo);
ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip;
__icmp_send(skb_in, type, code, info, &opts);
__icmp_send(skb_in, type, code, info, &parm);
ip_hdr(skb_in)->saddr = orig_ip;
out:
consume_skb(cloned_skb);

View File

@@ -1222,8 +1222,8 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
struct inet_skb_parm parm;
struct net_device *dev;
struct ip_options opt;
int res;
/* Recompile ip options since IPCB may not be valid anymore.
@@ -1233,21 +1233,21 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb)
ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
return;
memset(&opt, 0, sizeof(opt));
memset(&parm, 0, sizeof(parm));
if (ip_hdr(skb)->ihl > 5) {
if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
return;
opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL);
rcu_read_unlock();
if (res)
return;
}
__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm);
}
static void ipv4_link_failure(struct sk_buff *skb)

View File

@@ -10,28 +10,6 @@ PAUSE_ON_FAIL=no
################################################################################
#
log_test()
{
local rc=$1
local expected=$2
local msg="$3"
if [ ${rc} -eq ${expected} ]; then
printf "TEST: %-60s [ OK ]\n" "${msg}"
nsuccess=$((nsuccess+1))
else
ret=1
nfail=$((nfail+1))
printf "TEST: %-60s [FAIL]\n" "${msg}"
if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
echo
echo "hit enter to continue, 'q' to quit"
read a
[ "$a" = "q" ] && exit 1
fi
fi
}
run_cmd()
{
local ns
@@ -203,34 +181,137 @@ setup_traceroute6()
run_traceroute6()
{
if [ ! -x "$(command -v traceroute6)" ]; then
echo "SKIP: Could not run IPV6 test without traceroute6"
return
fi
setup_traceroute6
RET=0
# traceroute6 host-2 from host-1 (expects 2000:102::2)
run_cmd $h1 "traceroute6 2000:103::4 | grep -q 2000:102::2"
log_test $? 0 "IPV6 traceroute"
check_err $? "traceroute6 did not return 2000:102::2"
log_test "IPv6 traceroute"
cleanup_traceroute6
}
################################################################################
# traceroute6 with VRF test
#
# Verify that in this scenario
#
# ------------------------ N2
# | |
# ------ ------ N3 ----
# | R1 | | R2 |------|H2|
# ------ ------ ----
# | |
# ------------------------ N1
# |
# ----
# |H1|
# ----
#
# Where H1's default route goes through R1 and R1's default route goes through
# R2 over N2, traceroute6 from H1 to H2 reports R2's address on N2 and not N1.
# The interfaces connecting R2 to the different subnets are members of a VRF
# and the intention is to check that traceroute6 does not report the VRF's
# address.
#
# Addresses are assigned as follows:
#
# N1: 2000:101::/64
# N2: 2000:102::/64
# N3: 2000:103::/64
#
# R1's host part of address: 1
# R2's host part of address: 2
# H1's host part of address: 3
# H2's host part of address: 4
#
# For example:
# the IPv6 address of R1's interface on N2 is 2000:102::1/64
# Tear down the IPv6 VRF test topology.
# Delegates entirely to cleanup_all_ns (helper defined outside this script
# section; presumably removes every namespace registered via setup_ns).
cleanup_traceroute6_vrf()
{
cleanup_all_ns
}
# Build the H1 - R1 - R2 - H2 topology used by the IPv6 VRF traceroute test.
# On R2 all three ports (eth1, eth2, eth3) are enslaved to vrf100, and vrf100
# itself carries 2001:db8:100::1/64, an address outside N1/N2/N3 that the test
# expects traceroute6 NOT to report.
setup_traceroute6_vrf()
{
# Start clean
cleanup_traceroute6_vrf
# setup_ns/create_ns are helpers defined elsewhere; setup_ns is assumed to
# populate $h1, $h2, $r1 and $r2 with namespace names -- TODO confirm.
setup_ns h1 h2 r1 r2
create_ns "$h1"
create_ns "$h2"
create_ns "$r1"
create_ns "$r2"
# Create the VRF device on R2 (routing table 100) and give it its own address.
ip -n "$r2" link add name vrf100 up type vrf table 100
ip -n "$r2" addr add 2001:db8:100::1/64 dev vrf100
# Setup N3
connect_ns "$r2" eth3 - 2000:103::2/64 "$h2" eth3 - 2000:103::4/64
ip -n "$r2" link set dev eth3 master vrf100
ip -n "$h2" route add default via 2000:103::2
# Setup N2
connect_ns "$r1" eth2 - 2000:102::1/64 "$r2" eth2 - 2000:102::2/64
# R1 forwards everything to R2 over N2, so R2 receives H1's probes on eth2.
ip -n "$r1" route add default via 2000:102::2
ip -n "$r2" link set dev eth2 master vrf100
# Setup N1. host-1 and router-2 connect to a bridge in router-1.
ip -n "$r1" link add name br100 up type bridge
ip -n "$r1" addr add 2000:101::1/64 dev br100
# R1's ends of the N1 links get no address of their own ("- -"); R1's N1
# address lives on br100 instead.
connect_ns "$h1" eth0 - 2000:101::3/64 "$r1" eth0 - -
ip -n "$h1" route add default via 2000:101::1
ip -n "$r1" link set dev eth0 master br100
connect_ns "$r2" eth1 - 2000:101::2/64 "$r1" eth1 - -
ip -n "$r2" link set dev eth1 master vrf100
ip -n "$r1" link set dev eth1 master br100
# Prime the network
# Output and exit status of the ping are discarded; it is not itself a check.
ip netns exec "$h1" ping6 -c5 2000:103::4 >/dev/null 2>&1
}
# Run the IPv6 VRF traceroute test case: build the topology, trace from H1 to
# H2 (2000:103::4) and require R2's N2 address (2000:102::2) in the output,
# then tear the topology down.
run_traceroute6_vrf()
{
setup_traceroute6_vrf
# Reset the per-test result; presumably consumed by check_err/log_test
# (helpers defined outside this section) -- TODO confirm.
RET=0
# traceroute6 host-2 from host-1 (expects 2000:102::2)
run_cmd "$h1" "traceroute6 2000:103::4 | grep 2000:102::2"
# check_err must directly follow run_cmd so that $? is still its status.
check_err $? "traceroute6 did not return 2000:102::2"
log_test "IPv6 traceroute with VRF"
cleanup_traceroute6_vrf
}
################################################################################
# traceroute test
#
# Verify that traceroute from H1 to H2 shows 1.0.1.1 in this scenario
# Verify that traceroute from H1 to H2 shows 1.0.3.1 and 1.0.1.1 when
# traceroute uses 1.0.3.3 and 1.0.1.3 as the source IP, respectively.
#
# 1.0.3.1/24
# 1.0.3.3/24 1.0.3.1/24
# ---- 1.0.1.3/24 1.0.1.1/24 ---- 1.0.2.1/24 1.0.2.4/24 ----
# |H1|--------------------------|R1|--------------------------|H2|
# ---- N1 ---- N2 ----
#
# where net.ipv4.icmp_errors_use_inbound_ifaddr is set on R1 and
# 1.0.3.1/24 and 1.0.1.1/24 are respectively R1's primary and secondary
# address on N1.
#
# where net.ipv4.icmp_errors_use_inbound_ifaddr is set on R1 and 1.0.3.1/24 and
# 1.0.1.1/24 are R1's primary addresses on N1. The kernel is expected to prefer
# a source address that is on the same subnet as the destination IP of the ICMP
# error message.
cleanup_traceroute()
{
@@ -250,6 +331,7 @@ setup_traceroute()
connect_ns $h1 eth0 1.0.1.3/24 - \
$router eth1 1.0.3.1/24 -
ip -n "$h1" addr add 1.0.3.3/24 dev eth0
ip netns exec $h1 ip route add default via 1.0.1.1
ip netns exec $router ip addr add 1.0.1.1/24 dev eth1
@@ -268,35 +350,107 @@ setup_traceroute()
run_traceroute()
{
if [ ! -x "$(command -v traceroute)" ]; then
echo "SKIP: Could not run IPV4 test without traceroute"
return
fi
setup_traceroute
# traceroute host-2 from host-1 (expects 1.0.1.1). Takes a while.
run_cmd $h1 "traceroute 1.0.2.4 | grep -q 1.0.1.1"
log_test $? 0 "IPV4 traceroute"
RET=0
# traceroute host-2 from host-1. Expect a source IP that is on the same
# subnet as the destination IP of the ICMP error message.
run_cmd "$h1" "traceroute -s 1.0.1.3 1.0.2.4 | grep -q 1.0.1.1"
check_err $? "traceroute did not return 1.0.1.1"
run_cmd "$h1" "traceroute -s 1.0.3.3 1.0.2.4 | grep -q 1.0.3.1"
check_err $? "traceroute did not return 1.0.3.1"
log_test "IPv4 traceroute"
cleanup_traceroute
}
################################################################################
# traceroute with VRF test
#
# Verify that traceroute from H1 to H2 shows 1.0.3.1 and 1.0.1.1 when
# traceroute uses 1.0.3.3 and 1.0.1.3 as the source IP, respectively. The
# intention is to check that the kernel does not choose an IP assigned to the
# VRF device, but rather an address from the VRF port (eth1) that received the
# packet that generates the ICMP error message.
#
# 1.0.4.1/24 (vrf100)
# 1.0.3.3/24 1.0.3.1/24
# ---- 1.0.1.3/24 1.0.1.1/24 ---- 1.0.2.1/24 1.0.2.4/24 ----
# |H1|--------------------------|R1|--------------------------|H2|
# ---- N1 ---- N2 ----
# Tear down the IPv4 VRF test topology.
# Delegates entirely to cleanup_all_ns (helper defined outside this script
# section; presumably removes every namespace registered via setup_ns).
cleanup_traceroute_vrf()
{
cleanup_all_ns
}
# Build the H1 - router - H2 topology used by the IPv4 VRF traceroute test.
# The router's eth1 (towards H1) and eth2 (towards H2) are enslaved to vrf100,
# which carries 1.0.4.1/24. eth1 holds two addresses, 1.0.1.1/24 and
# 1.0.3.1/24, matching H1's two source addresses 1.0.1.3 and 1.0.3.3.
setup_traceroute_vrf()
{
# Start clean
cleanup_traceroute_vrf
# setup_ns/create_ns are helpers defined elsewhere; setup_ns is assumed to
# populate $h1, $h2 and $router with namespace names -- TODO confirm.
setup_ns h1 h2 router
create_ns "$h1"
create_ns "$h2"
create_ns "$router"
# Create the VRF device (routing table 100) and give it its own address.
ip -n "$router" link add name vrf100 up type vrf table 100
ip -n "$router" addr add 1.0.4.1/24 dev vrf100
connect_ns "$h1" eth0 1.0.1.3/24 - \
"$router" eth1 1.0.1.1/24 -
# Second subnet on the same H1 <-> router link.
ip -n "$h1" addr add 1.0.3.3/24 dev eth0
ip -n "$h1" route add default via 1.0.1.1
ip -n "$router" link set dev eth1 master vrf100
ip -n "$router" addr add 1.0.3.1/24 dev eth1
# Have the router derive the ICMP error source address from the inbound
# interface; this is the behavior the test exercises.
ip netns exec "$router" sysctl -qw \
net.ipv4.icmp_errors_use_inbound_ifaddr=1
connect_ns "$h2" eth0 1.0.2.4/24 - \
"$router" eth2 1.0.2.1/24 -
ip -n "$h2" route add default via 1.0.2.1
ip -n "$router" link set dev eth2 master vrf100
# Prime the network
# Output and exit status of the ping are discarded; it is not itself a check.
ip netns exec "$h1" ping -c5 1.0.2.4 >/dev/null 2>&1
}
# Run the IPv4 VRF traceroute test case: build the topology, trace from H1 to
# H2 (1.0.2.4) once per H1 source address, and require the router's eth1
# address from the matching subnet in each output. Both checks feed a single
# log_test result.
run_traceroute_vrf()
{
setup_traceroute_vrf
# Reset the per-test result; presumably consumed by check_err/log_test
# (helpers defined outside this section) -- TODO confirm.
RET=0
# traceroute host-2 from host-1. Expect a source IP that is on the same
# subnet as the destination IP of the ICMP error message.
run_cmd "$h1" "traceroute -s 1.0.1.3 1.0.2.4 | grep 1.0.1.1"
# Each check_err must directly follow its run_cmd so that $? is still valid.
check_err $? "traceroute did not return 1.0.1.1"
run_cmd "$h1" "traceroute -s 1.0.3.3 1.0.2.4 | grep 1.0.3.1"
check_err $? "traceroute did not return 1.0.3.1"
log_test "IPv4 traceroute with VRF"
cleanup_traceroute_vrf
}
################################################################################
# Run tests
# Run every test case in order: the IPv6 cases first, then the IPv4 cases,
# each plain variant followed by its VRF variant. Each run_* function calls
# its own setup and cleanup.
run_tests()
{
run_traceroute6
run_traceroute6_vrf
run_traceroute
run_traceroute_vrf
}
################################################################################
# main
declare -i nfail=0
declare -i nsuccess=0
while getopts :pv o
do
case $o in
@@ -306,7 +460,9 @@ do
esac
done
require_command traceroute6
require_command traceroute
run_tests
printf "\nTests passed: %3d\n" ${nsuccess}
printf "Tests failed: %3d\n" ${nfail}
exit "${EXIT_STATUS}"