From 32607a332cfea5a4b2a185f3e3d605a9bf4f8df0 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:18 -0400 Subject: [PATCH 1/3] ipv4: prefer multipath nexthop that matches source address With multipath routes, try to ensure that packets leave on the device that is associated with the source address. Avoid the following tcpdump example: veth0 Out IP 10.1.0.2.38640 > 10.2.0.3.8000: Flags [S] veth1 Out IP 10.1.0.2.38648 > 10.2.0.3.8000: Flags [S] Which can happen easily with the most straightforward setup: ip addr add 10.0.0.1/24 dev veth0 ip addr add 10.1.0.1/24 dev veth1 ip route add 10.2.0.3 nexthop via 10.0.0.2 dev veth0 \ nexthop via 10.1.0.2 dev veth1 This is apparently considered WAI, based on the comment in ip_route_output_key_hash_rcu: * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK It may be ok for some uses of multipath, but not all. For instance, when using two ISPs, a router may drop packets with unknown source. The behavior occurs because tcp_v4_connect makes three route lookups when establishing a connection: 1. ip_route_connect calls to select a source address, with saddr zero. 2. ip_route_connect calls again now that saddr and daddr are known. 3. ip_route_newports calls again after a source port is also chosen. With a route with multiple nexthops, each lookup may make a different choice depending on available entropy to fib_select_multipath. So it is possible for 1 to select the saddr from the first entry, but 3 to select the second entry. Leading to the above situation. Address this by preferring a match that matches the flowi4 saddr. This will make 2 and 3 make the same choice as 1. Continue to update the backup choice until a choice that matches saddr is found. Do this in fib_select_multipath itself, rather than passing an fl4_oif constraint, to avoid changing non-multipath route selection. Commit e6b45241c57a ("ipv4: reset flowi parameters on route connect") shows how that may cause regressions. Also read ipv4.sysctl_fib_multipath_use_neigh only once. No need to refresh in the loop. This does not happen in IPv6, which performs only one lookup. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-2-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_semantics.c | 39 +++++++++++++++++++++++++-------------- net/ipv4/route.c | 2 +- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e3864b74e92a..48bb3cf41469 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net, int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); -void fib_select_multipath(struct fib_result *res, int hash); +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5326f1501af0..2371f311a1e1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -2170,34 +2170,45 @@ static bool fib_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -void fib_select_multipath(struct fib_result *res, int hash) +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; - bool first = false; + bool found = false; + bool use_neigh; + __be32 saddr; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } + use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); + saddr = fl4 ? fl4->saddr : 0; + change_nexthops(fi) { - if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { - if (!fib_good_nh(nexthop_nh)) - continue; - if (!first) { - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - first = true; - } + if (use_neigh && !fib_good_nh(nexthop_nh)) + continue; + + if (!found) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + found = !saddr || nexthop_nh->nh_saddr == saddr; } if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) continue; - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - return; + if (!saddr || nexthop_nh->nh_saddr == saddr) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + return; + } + + if (found) + return; + } endfor_nexthops(fi); } #endif @@ -2212,7 +2223,7 @@ void fib_select_path(struct net *net, struct fib_result *res, if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); - fib_select_multipath(res, h); + fib_select_multipath(res, h, fl4); } else #endif diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 49cffbe83802..e5e4c71be3af 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2154,7 +2154,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); - fib_select_multipath(res, h); + fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif From 65e9024643c7512ade3aedbb341e11d77ed7abc2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:19 -0400 Subject: [PATCH 2/3] ip: load balance tcp connections to single dst addr and port Load balance new TCP connections across nexthops also when they connect to the same service at a single remote address and port. This affects only port-based multipath hashing: fib_multipath_hash_policy 1 or 3. Local connections must choose both a source address and port when connecting to a remote service, in ip_route_connect. This "chicken-and-egg problem" (commit 2d7192d6cbab ("ipv4: Sanitize and simplify ip_route_{connect,newports}()")) is resolved by first selecting a source address, by looking up a route using the zero wildcard source port and address. As a result multiple connections to the same destination address and port have no entropy in fib_multipath_hash. This is not a problem when forwarding, as skb-based hashing has a 4-tuple. Nor when establishing UDP connections, as autobind there selects a port before reaching ip_route_connect. Load balance also TCP, by using a random port in fib_multipath_hash. Port assignment in inet_hash_connect is not atomic with ip_route_connect. Thus ports are unpredictable, effectively random. Implementation details: Do not actually pass a random fl4_sport, as that affects not only hashing, but routing more broadly, and can match a source port based policy route, which existing wildcard port 0 will not. Instead, define a new wildcard flowi flag that is used only for hashing. Selecting a random source is equivalent to just selecting a random hash entirely. But for code clarity, follow the normal 4-tuple hash process and only update this field. fib_multipath_hash can be reached with zero sport from other code paths, so explicitly pass this flowi flag, rather than trying to infer this case in the function itself. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-3-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/flow.h | 1 + include/net/route.h | 3 +++ net/ipv4/route.c | 13 ++++++++++--- net/ipv6/route.c | 13 ++++++++++--- net/ipv6/tcp_ipv6.c | 2 ++ 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 2a3f0c42f092..a1839c278d87 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -39,6 +39,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_L3MDEV_OIF 0x04 +#define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; __u32 flowic_multipath_hash; diff --git a/include/net/route.h b/include/net/route.h index c605fd5ec0c0..8e39aa822cf9 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -326,6 +326,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) + flow_flags |= FLOWI_FLAG_ANY_SPORT; + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e5e4c71be3af..507b2e5dec50 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2037,8 +2037,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net, hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; @@ -2093,7 +2097,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; - hash_keys.ports.src = fl4->fl4_sport; + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d0351e95d916..aa6b45bd3515 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2492,8 +2492,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net, hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; @@ -2547,7 +2551,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.ports.src = fl6->fl6_sport; + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7dcb33f879ee..e8e68a142649 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) + fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); From 4d0dac499bf384fe3f42acc30906d304c3499dd8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:20 -0400 Subject: [PATCH 3/3] selftests/net: test tcp connection load balancing Verify that TCP connections use both routes when connecting multiple times to a remote service over a two nexthop multipath route. Use socat to create the connections. Use tc prio + tc filter to count routes taken, counting SYN packets across the two egress devices. Also verify that the saddr matches that of the device. To avoid flaky tests when testing inherently randomized behavior, set a low bar and pass if even a single SYN is observed on each device. Signed-off-by: Willem de Bruijn Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-4-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/fib_tests.sh | 120 ++++++++++++++++++++++- tools/testing/selftests/net/lib.sh | 24 +++++ 2 files changed, 143 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 3ea6f886a210..c58dc4ac2810 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -11,7 +11,7 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \ ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \ ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \ ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \ - ipv4_mpath_list ipv6_mpath_list" + ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance" VERBOSE=0 PAUSE_ON_FAIL=no @@ -1085,6 +1085,35 @@ route_setup() set +e } +forwarding_cleanup() +{ + cleanup_ns $ns3 + + route_cleanup +} + +# extend route_setup with an ns3 reachable through ns2 over both devices +forwarding_setup() +{ + forwarding_cleanup + + route_setup + + setup_ns ns3 + + ip link add veth5 netns $ns3 type veth peer name veth6 netns $ns2 + ip -netns $ns3 link set veth5 up + ip -netns $ns2 link set veth6 up + + ip -netns $ns3 -4 addr add dev veth5 172.16.105.1/24 + ip -netns $ns2 -4 addr add dev veth6 172.16.105.2/24 + ip -netns $ns3 -4 route add 172.16.100.0/22 via 172.16.105.2 + + ip -netns $ns3 -6 addr add dev veth5 2001:db8:105::1/64 nodad + ip -netns $ns2 -6 addr add dev veth6 2001:db8:105::2/64 nodad + ip -netns $ns3 -6 route add 2001:db8:101::/33 via 2001:db8:105::2 +} + # assumption is that basic add of a single path route works # otherwise just adding an address on an interface is broken ipv6_rt_add() @@ -2600,6 +2629,93 @@ ipv6_mpath_list_test() route_cleanup } +tc_set_flower_counter__saddr_syn() { + tc_set_flower_counter $1 $2 $3 "src_ip $4 ip_proto tcp tcp_flags 0x2" +} + +ip_mpath_balance_dep_check() +{ + if [ ! -x "$(command -v socat)" ]; then + echo "socat command not found. Skipping test" + return 1 + fi + + if [ ! -x "$(command -v jq)" ]; then + echo "jq command not found. Skipping test" + return 1 + fi +} + +ip_mpath_balance() { + local -r ipver=$1 + local -r daddr=$2 + local -r num_conn=20 + + for i in $(seq 1 $num_conn); do + ip netns exec $ns3 socat $ipver TCP-LISTEN:8000 STDIO >/dev/null & + sleep 0.02 + echo -n a | ip netns exec $ns1 socat $ipver STDIO TCP:$daddr:8000 + done + + local -r syn0="$(tc_get_flower_counter $ns1 veth1)" + local -r syn1="$(tc_get_flower_counter $ns1 veth3)" + local -r syns=$((syn0+syn1)) + + [ "$VERBOSE" = "1" ] && echo "multipath: syns seen: ($syn0,$syn1)" + + [[ $syns -ge $num_conn ]] && [[ $syn0 -gt 0 ]] && [[ $syn1 -gt 0 ]] +} + +ipv4_mpath_balance_test() +{ + echo + echo "IPv4 multipath load balance test" + + ip_mpath_balance_dep_check || return 1 + forwarding_setup + + $IP route add 172.16.105.1 \ + nexthop via 172.16.101.2 \ + nexthop via 172.16.103.2 + + ip netns exec $ns1 \ + sysctl -q -w net.ipv4.fib_multipath_hash_policy=1 + + tc_set_flower_counter__saddr_syn $ns1 4 veth1 172.16.101.1 + tc_set_flower_counter__saddr_syn $ns1 4 veth3 172.16.103.1 + + ip_mpath_balance -4 172.16.105.1 + + log_test $? 0 "IPv4 multipath loadbalance" + + forwarding_cleanup +} + +ipv6_mpath_balance_test() +{ + echo + echo "IPv6 multipath load balance test" + + ip_mpath_balance_dep_check || return 1 + forwarding_setup + + $IP route add 2001:db8:105::1\ + nexthop via 2001:db8:101::2 \ + nexthop via 2001:db8:103::2 + + ip netns exec $ns1 \ + sysctl -q -w net.ipv6.fib_multipath_hash_policy=1 + + tc_set_flower_counter__saddr_syn $ns1 6 veth1 2001:db8:101::1 + tc_set_flower_counter__saddr_syn $ns1 6 veth3 2001:db8:103::1 + + ip_mpath_balance -6 "[2001:db8:105::1]" + + log_test $? 0 "IPv6 multipath loadbalance" + + forwarding_cleanup +} + ################################################################################ # usage @@ -2683,6 +2799,8 @@ do fib6_gc_test|ipv6_gc) fib6_gc_test;; ipv4_mpath_list) ipv4_mpath_list_test;; ipv6_mpath_list) ipv6_mpath_list_test;; + ipv4_mpath_balance) ipv4_mpath_balance_test;; + ipv6_mpath_balance) ipv6_mpath_balance_test;; help) echo "Test names: $TESTS"; exit 0;; esac diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 701905eeff66..7e1e56318625 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -270,6 +270,30 @@ tc_rule_handle_stats_get() .options.actions[0].stats$selector" } +# attach a qdisc with two children match/no-match and a flower filter to match +tc_set_flower_counter() { + local -r ns=$1 + local -r ipver=$2 + local -r dev=$3 + local -r flower_expr=$4 + + tc -n $ns qdisc add dev $dev root handle 1: prio bands 2 \ + priomap 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + tc -n $ns qdisc add dev $dev parent 1:1 handle 11: pfifo + tc -n $ns qdisc add dev $dev parent 1:2 handle 12: pfifo + + tc -n $ns filter add dev $dev parent 1: protocol ipv$ipver \ + flower $flower_expr classid 1:2 +} + +tc_get_flower_counter() { + local -r ns=$1 + local -r dev=$2 + + tc -n $ns -j -s qdisc show dev $dev handle 12: | jq .[0].packets +} + ret_set_ksft_status() { local ksft_status=$1; shift