Merge branch 'ip-improve-tcp-sock-multipath-routing'

Willem de Bruijn says:

====================
ip: improve tcp sock multipath routing

From: Willem de Bruijn <willemb@google.com>

Improve layer 4 multipath hash policy for local tcp connections:

patch 1: Select a source address that matches the nexthop device.
         Due to tcp_v4_connect making separate route lookups for saddr
         and route, the two can currently be inconsistent.

patch 2: Use all paths when opening multiple local tcp connections to
         the same ip address and port.

patch 3: Test the behavior. Extend the fib_tests.sh testsuite with a
         test that opens many connections and counts SYNs on both egress
         devices, matching packets against the source address of each dev.

Changelog in the individual patches
====================

Link: https://patch.msgid.link/20250424143549.669426-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
Paolo Abeni
2025-04-29 16:22:26 +02:00
9 changed files with 197 additions and 23 deletions

View File

@@ -39,6 +39,7 @@ struct flowi_common {
#define FLOWI_FLAG_ANYSRC 0x01
#define FLOWI_FLAG_KNOWN_NH 0x02
#define FLOWI_FLAG_L3MDEV_OIF 0x04
#define FLOWI_FLAG_ANY_SPORT 0x08
__u32 flowic_secid;
kuid_t flowic_uid;
__u32 flowic_multipath_hash;

View File

@@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net,
int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
struct netlink_ext_ack *extack);
void fib_select_multipath(struct fib_result *res, int hash);
void fib_select_multipath(struct fib_result *res, int hash,
const struct flowi4 *fl4);
void fib_select_path(struct net *net, struct fib_result *res,
struct flowi4 *fl4, const struct sk_buff *skb);

View File

@@ -326,6 +326,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
if (inet_test_bit(TRANSPARENT, sk))
flow_flags |= FLOWI_FLAG_ANYSRC;
if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport)
flow_flags |= FLOWI_FLAG_ANY_SPORT;
flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk),
ip_sock_rt_scope(sk), protocol, flow_flags, dst,
src, dport, sport, sk->sk_uid);

View File

@@ -2170,34 +2170,45 @@ static bool fib_good_nh(const struct fib_nh *nh)
return !!(state & NUD_VALID);
}
void fib_select_multipath(struct fib_result *res, int hash)
void fib_select_multipath(struct fib_result *res, int hash,
const struct flowi4 *fl4)
{
struct fib_info *fi = res->fi;
struct net *net = fi->fib_net;
bool first = false;
bool found = false;
bool use_neigh;
__be32 saddr;
if (unlikely(res->fi->nh)) {
nexthop_path_fib_result(res, hash);
return;
}
use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh);
saddr = fl4 ? fl4->saddr : 0;
change_nexthops(fi) {
if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) {
if (!fib_good_nh(nexthop_nh))
continue;
if (!first) {
res->nh_sel = nhsel;
res->nhc = &nexthop_nh->nh_common;
first = true;
}
if (use_neigh && !fib_good_nh(nexthop_nh))
continue;
if (!found) {
res->nh_sel = nhsel;
res->nhc = &nexthop_nh->nh_common;
found = !saddr || nexthop_nh->nh_saddr == saddr;
}
if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound))
continue;
res->nh_sel = nhsel;
res->nhc = &nexthop_nh->nh_common;
return;
if (!saddr || nexthop_nh->nh_saddr == saddr) {
res->nh_sel = nhsel;
res->nhc = &nexthop_nh->nh_common;
return;
}
if (found)
return;
} endfor_nexthops(fi);
}
#endif
@@ -2212,7 +2223,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
if (fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(net, fl4, skb, NULL);
fib_select_multipath(res, h);
fib_select_multipath(res, h, fl4);
}
else
#endif

View File

@@ -2037,8 +2037,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net,
hash_keys.addrs.v4addrs.dst = fl4->daddr;
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
hash_keys.basic.ip_proto = fl4->flowi4_proto;
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
hash_keys.ports.src = fl4->fl4_sport;
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
hash_keys.ports.src = (__force __be16)get_random_u16();
else
hash_keys.ports.src = fl4->fl4_sport;
}
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
hash_keys.ports.dst = fl4->fl4_dport;
@@ -2093,7 +2097,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
hash_keys.addrs.v4addrs.src = fl4->saddr;
hash_keys.addrs.v4addrs.dst = fl4->daddr;
hash_keys.ports.src = fl4->fl4_sport;
if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
hash_keys.ports.src = (__force __be16)get_random_u16();
else
hash_keys.ports.src = fl4->fl4_sport;
hash_keys.ports.dst = fl4->fl4_dport;
hash_keys.basic.ip_proto = fl4->flowi4_proto;
}
@@ -2154,7 +2161,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
if (res->fi && fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
fib_select_multipath(res, h);
fib_select_multipath(res, h, NULL);
IPCB(skb)->flags |= IPSKB_MULTIPATH;
}
#endif

View File

@@ -2492,8 +2492,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
hash_keys.basic.ip_proto = fl6->flowi6_proto;
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
hash_keys.ports.src = fl6->fl6_sport;
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
hash_keys.ports.src = (__force __be16)get_random_u16();
else
hash_keys.ports.src = fl6->fl6_sport;
}
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
hash_keys.ports.dst = fl6->fl6_dport;
@@ -2547,7 +2551,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
hash_keys.addrs.v6addrs.src = fl6->saddr;
hash_keys.addrs.v6addrs.dst = fl6->daddr;
hash_keys.ports.src = fl6->fl6_sport;
if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
hash_keys.ports.src = (__force __be16)get_random_u16();
else
hash_keys.ports.src = fl6->fl6_sport;
hash_keys.ports.dst = fl6->fl6_dport;
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}

View File

@@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport)
fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT;
fl6.flowi6_uid = sk->sk_uid;
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));

View File

@@ -11,7 +11,7 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \
ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \
ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \
ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \
ipv4_mpath_list ipv6_mpath_list"
ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance"
VERBOSE=0
PAUSE_ON_FAIL=no
@@ -1085,6 +1085,35 @@ route_setup()
set +e
}
# Tear down the forwarding topology: remove the extra ns3 namespace
# created by forwarding_setup, then run the common route_cleanup.
forwarding_cleanup()
{
cleanup_ns $ns3
route_cleanup
}
# extend route_setup with an ns3 reachable through ns2 over both devices
forwarding_setup()
{
forwarding_cleanup
route_setup
setup_ns ns3
# connect ns3 to ns2 with a dedicated veth pair
ip link add veth5 netns $ns3 type veth peer name veth6 netns $ns2
ip -netns $ns3 link set veth5 up
ip -netns $ns2 link set veth6 up
# IPv4 addressing plus a return route covering 172.16.100.0/22
# (includes the 172.16.101.x/103.x source addresses used by the tests)
ip -netns $ns3 -4 addr add dev veth5 172.16.105.1/24
ip -netns $ns2 -4 addr add dev veth6 172.16.105.2/24
ip -netns $ns3 -4 route add 172.16.100.0/22 via 172.16.105.2
# equivalent IPv6 addressing and return route
ip -netns $ns3 -6 addr add dev veth5 2001:db8:105::1/64 nodad
ip -netns $ns2 -6 addr add dev veth6 2001:db8:105::2/64 nodad
ip -netns $ns3 -6 route add 2001:db8:101::/33 via 2001:db8:105::2
}
# assumption is that basic add of a single path route works
# otherwise just adding an address on an interface is broken
ipv6_rt_add()
@@ -2600,6 +2629,93 @@ ipv6_mpath_list_test()
route_cleanup
}
# Install a flower packet counter matching TCP SYN packets (tcp_flags
# 0x2) with the given source address.
# $1: netns, $2: IP version (4|6), $3: device, $4: source address
tc_set_flower_counter__saddr_syn() {
tc_set_flower_counter $1 $2 $3 "src_ip $4 ip_proto tcp tcp_flags 0x2"
}
# Verify that the external tools the multipath balance tests rely on
# (socat, jq) are available; return non-zero to skip the test otherwise.
ip_mpath_balance_dep_check()
{
if [ ! -x "$(command -v socat)" ]; then
echo "socat command not found. Skipping test"
return 1
fi
if [ ! -x "$(command -v jq)" ]; then
echo "jq command not found. Skipping test"
return 1
fi
}
# Open $num_conn short-lived TCP connections from $ns1 to $daddr:8000
# (served by per-connection socat listeners in $ns3), then check via the
# previously installed flower counters that SYNs egressed over BOTH
# devices (veth1 and veth3) in $ns1.
# $1: socat address family flag (-4|-6), $2: destination address
ip_mpath_balance() {
local -r ipver=$1
local -r daddr=$2
local -r num_conn=20
for i in $(seq 1 $num_conn); do
ip netns exec $ns3 socat $ipver TCP-LISTEN:8000 STDIO >/dev/null &
# brief pause so the listener is ready before the client connects
sleep 0.02
echo -n a | ip netns exec $ns1 socat $ipver STDIO TCP:$daddr:8000
done
local -r syn0="$(tc_get_flower_counter $ns1 veth1)"
local -r syn1="$(tc_get_flower_counter $ns1 veth3)"
local -r syns=$((syn0+syn1))
[ "$VERBOSE" = "1" ] && echo "multipath: syns seen: ($syn0,$syn1)"
# pass if every connection was counted and both paths carried traffic;
# the total may exceed num_conn (e.g. retransmitted SYNs)
[[ $syns -ge $num_conn ]] && [[ $syn0 -gt 0 ]] && [[ $syn1 -gt 0 ]]
}
# IPv4: verify that locally-originated TCP connections to the same
# address/port are load-balanced across both nexthops of a multipath
# route when the L4 hash policy is in effect.
ipv4_mpath_balance_test()
{
echo
echo "IPv4 multipath load balance test"
ip_mpath_balance_dep_check || return 1
forwarding_setup
# two-nexthop route to the ns3 address, one nexthop per egress device
$IP route add 172.16.105.1 \
nexthop via 172.16.101.2 \
nexthop via 172.16.103.2
# policy 1 selects the layer-4 multipath hash
ip netns exec $ns1 \
sysctl -q -w net.ipv4.fib_multipath_hash_policy=1
# count egress SYNs per device, keyed on that device's own address
tc_set_flower_counter__saddr_syn $ns1 4 veth1 172.16.101.1
tc_set_flower_counter__saddr_syn $ns1 4 veth3 172.16.103.1
ip_mpath_balance -4 172.16.105.1
log_test $? 0 "IPv4 multipath loadbalance"
forwarding_cleanup
}
# IPv6 counterpart of ipv4_mpath_balance_test: same multipath route and
# L4 hash policy setup, using the 2001:db8: test prefixes.
ipv6_mpath_balance_test()
{
echo
echo "IPv6 multipath load balance test"
ip_mpath_balance_dep_check || return 1
forwarding_setup
# two-nexthop route to the ns3 address, one nexthop per egress device
$IP route add 2001:db8:105::1\
nexthop via 2001:db8:101::2 \
nexthop via 2001:db8:103::2
# policy 1 selects the layer-4 multipath hash
ip netns exec $ns1 \
sysctl -q -w net.ipv6.fib_multipath_hash_policy=1
# count egress SYNs per device, keyed on that device's own address
tc_set_flower_counter__saddr_syn $ns1 6 veth1 2001:db8:101::1
tc_set_flower_counter__saddr_syn $ns1 6 veth3 2001:db8:103::1
ip_mpath_balance -6 "[2001:db8:105::1]"
log_test $? 0 "IPv6 multipath loadbalance"
forwarding_cleanup
}
################################################################################
# usage
@@ -2683,6 +2799,8 @@ do
fib6_gc_test|ipv6_gc) fib6_gc_test;;
ipv4_mpath_list) ipv4_mpath_list_test;;
ipv6_mpath_list) ipv6_mpath_list_test;;
ipv4_mpath_balance) ipv4_mpath_balance_test;;
ipv6_mpath_balance) ipv6_mpath_balance_test;;
help) echo "Test names: $TESTS"; exit 0;;
esac

View File

@@ -270,6 +270,30 @@ tc_rule_handle_stats_get()
.options.actions[0].stats$selector"
}
# attach a qdisc with two children match/no-match and a flower filter to match
# Matching packets are steered to band 1:2 (handle 12:), whose qdisc
# packet statistic then serves as the match counter (read back with
# tc_get_flower_counter).
# $1: netns, $2: IP version (4|6), $3: device, $4: flower match expression
tc_set_flower_counter() {
local -r ns=$1
local -r ipver=$2
local -r dev=$3
local -r flower_expr=$4
# 2-band prio root; the all-zero priomap sends traffic to band 1 unless
# a filter reclassifies it
tc -n $ns qdisc add dev $dev root handle 1: prio bands 2 \
priomap 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
tc -n $ns qdisc add dev $dev parent 1:1 handle 11: pfifo
tc -n $ns qdisc add dev $dev parent 1:2 handle 12: pfifo
tc -n $ns filter add dev $dev parent 1: protocol ipv$ipver \
flower $flower_expr classid 1:2
}
# Read the number of packets that matched the filter installed by
# tc_set_flower_counter, i.e. the packet count of the band-2 pfifo (12:).
# $1: netns, $2: device
tc_get_flower_counter() {
local -r ns=$1
local -r dev=$2
tc -n $ns -j -s qdisc show dev $dev handle 12: | jq .[0].packets
}
ret_set_ksft_status()
{
local ksft_status=$1; shift