mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-10 05:39:42 -04:00
Merge branch 'ip-improve-tcp-sock-multipath-routing'
Willem de Bruijn says:

====================
ip: improve tcp sock multipath routing

From: Willem de Bruijn <willemb@google.com>

Improve layer 4 multipath hash policy for local tcp connections:

patch 1: Select a source address that matches the nexthop device.
         Due to tcp_v4_connect making separate route lookups for saddr
         and route, the two can currently be inconsistent.

patch 2: Use all paths when opening multiple local tcp connections to
         the same ip address and port.

patch 3: Test the behavior. Extend the fib_tests.sh testsuite with one
         opening many connections, and count SYNs on both egress
         devices, for packets matching the source address of the dev.

Changelog in the individual patches
====================

Link: https://patch.msgid.link/20250424143549.669426-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
@@ -39,6 +39,7 @@ struct flowi_common {
|
||||
#define FLOWI_FLAG_ANYSRC 0x01
|
||||
#define FLOWI_FLAG_KNOWN_NH 0x02
|
||||
#define FLOWI_FLAG_L3MDEV_OIF 0x04
|
||||
#define FLOWI_FLAG_ANY_SPORT 0x08
|
||||
__u32 flowic_secid;
|
||||
kuid_t flowic_uid;
|
||||
__u32 flowic_multipath_hash;
|
||||
|
||||
@@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net,
|
||||
|
||||
int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
|
||||
struct netlink_ext_ack *extack);
|
||||
void fib_select_multipath(struct fib_result *res, int hash);
|
||||
void fib_select_multipath(struct fib_result *res, int hash,
|
||||
const struct flowi4 *fl4);
|
||||
void fib_select_path(struct net *net, struct fib_result *res,
|
||||
struct flowi4 *fl4, const struct sk_buff *skb);
|
||||
|
||||
|
||||
@@ -326,6 +326,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
|
||||
if (inet_test_bit(TRANSPARENT, sk))
|
||||
flow_flags |= FLOWI_FLAG_ANYSRC;
|
||||
|
||||
if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport)
|
||||
flow_flags |= FLOWI_FLAG_ANY_SPORT;
|
||||
|
||||
flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk),
|
||||
ip_sock_rt_scope(sk), protocol, flow_flags, dst,
|
||||
src, dport, sport, sk->sk_uid);
|
||||
|
||||
@@ -2170,34 +2170,45 @@ static bool fib_good_nh(const struct fib_nh *nh)
|
||||
return !!(state & NUD_VALID);
|
||||
}
|
||||
|
||||
void fib_select_multipath(struct fib_result *res, int hash)
|
||||
void fib_select_multipath(struct fib_result *res, int hash,
|
||||
const struct flowi4 *fl4)
|
||||
{
|
||||
struct fib_info *fi = res->fi;
|
||||
struct net *net = fi->fib_net;
|
||||
bool first = false;
|
||||
bool found = false;
|
||||
bool use_neigh;
|
||||
__be32 saddr;
|
||||
|
||||
if (unlikely(res->fi->nh)) {
|
||||
nexthop_path_fib_result(res, hash);
|
||||
return;
|
||||
}
|
||||
|
||||
use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh);
|
||||
saddr = fl4 ? fl4->saddr : 0;
|
||||
|
||||
change_nexthops(fi) {
|
||||
if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) {
|
||||
if (!fib_good_nh(nexthop_nh))
|
||||
continue;
|
||||
if (!first) {
|
||||
res->nh_sel = nhsel;
|
||||
res->nhc = &nexthop_nh->nh_common;
|
||||
first = true;
|
||||
}
|
||||
if (use_neigh && !fib_good_nh(nexthop_nh))
|
||||
continue;
|
||||
|
||||
if (!found) {
|
||||
res->nh_sel = nhsel;
|
||||
res->nhc = &nexthop_nh->nh_common;
|
||||
found = !saddr || nexthop_nh->nh_saddr == saddr;
|
||||
}
|
||||
|
||||
if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound))
|
||||
continue;
|
||||
|
||||
res->nh_sel = nhsel;
|
||||
res->nhc = &nexthop_nh->nh_common;
|
||||
return;
|
||||
if (!saddr || nexthop_nh->nh_saddr == saddr) {
|
||||
res->nh_sel = nhsel;
|
||||
res->nhc = &nexthop_nh->nh_common;
|
||||
return;
|
||||
}
|
||||
|
||||
if (found)
|
||||
return;
|
||||
|
||||
} endfor_nexthops(fi);
|
||||
}
|
||||
#endif
|
||||
@@ -2212,7 +2223,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
|
||||
if (fib_info_num_path(res->fi) > 1) {
|
||||
int h = fib_multipath_hash(net, fl4, skb, NULL);
|
||||
|
||||
fib_select_multipath(res, h);
|
||||
fib_select_multipath(res, h, fl4);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
|
||||
@@ -2037,8 +2037,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net,
|
||||
hash_keys.addrs.v4addrs.dst = fl4->daddr;
|
||||
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
|
||||
hash_keys.basic.ip_proto = fl4->flowi4_proto;
|
||||
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
|
||||
hash_keys.ports.src = fl4->fl4_sport;
|
||||
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
|
||||
if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
|
||||
hash_keys.ports.src = (__force __be16)get_random_u16();
|
||||
else
|
||||
hash_keys.ports.src = fl4->fl4_sport;
|
||||
}
|
||||
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
|
||||
hash_keys.ports.dst = fl4->fl4_dport;
|
||||
|
||||
@@ -2093,7 +2097,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
|
||||
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
|
||||
hash_keys.addrs.v4addrs.src = fl4->saddr;
|
||||
hash_keys.addrs.v4addrs.dst = fl4->daddr;
|
||||
hash_keys.ports.src = fl4->fl4_sport;
|
||||
if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
|
||||
hash_keys.ports.src = (__force __be16)get_random_u16();
|
||||
else
|
||||
hash_keys.ports.src = fl4->fl4_sport;
|
||||
hash_keys.ports.dst = fl4->fl4_dport;
|
||||
hash_keys.basic.ip_proto = fl4->flowi4_proto;
|
||||
}
|
||||
@@ -2154,7 +2161,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
|
||||
if (res->fi && fib_info_num_path(res->fi) > 1) {
|
||||
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
|
||||
|
||||
fib_select_multipath(res, h);
|
||||
fib_select_multipath(res, h, NULL);
|
||||
IPCB(skb)->flags |= IPSKB_MULTIPATH;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -2492,8 +2492,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
|
||||
hash_keys.basic.ip_proto = fl6->flowi6_proto;
|
||||
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
|
||||
hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
|
||||
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
|
||||
hash_keys.ports.src = fl6->fl6_sport;
|
||||
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
|
||||
if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
|
||||
hash_keys.ports.src = (__force __be16)get_random_u16();
|
||||
else
|
||||
hash_keys.ports.src = fl6->fl6_sport;
|
||||
}
|
||||
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
|
||||
hash_keys.ports.dst = fl6->fl6_dport;
|
||||
|
||||
@@ -2547,7 +2551,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
|
||||
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
|
||||
hash_keys.addrs.v6addrs.src = fl6->saddr;
|
||||
hash_keys.addrs.v6addrs.dst = fl6->daddr;
|
||||
hash_keys.ports.src = fl6->fl6_sport;
|
||||
if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
|
||||
hash_keys.ports.src = (__force __be16)get_random_u16();
|
||||
else
|
||||
hash_keys.ports.src = fl6->fl6_sport;
|
||||
hash_keys.ports.dst = fl6->fl6_dport;
|
||||
hash_keys.basic.ip_proto = fl6->flowi6_proto;
|
||||
}
|
||||
|
||||
@@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
|
||||
fl6.flowi6_mark = sk->sk_mark;
|
||||
fl6.fl6_dport = usin->sin6_port;
|
||||
fl6.fl6_sport = inet->inet_sport;
|
||||
if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport)
|
||||
fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT;
|
||||
fl6.flowi6_uid = sk->sk_uid;
|
||||
|
||||
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
|
||||
|
||||
@@ -11,7 +11,7 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \
|
||||
ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \
|
||||
ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \
|
||||
ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \
|
||||
ipv4_mpath_list ipv6_mpath_list"
|
||||
ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance"
|
||||
|
||||
VERBOSE=0
|
||||
PAUSE_ON_FAIL=no
|
||||
@@ -1085,6 +1085,35 @@ route_setup()
|
||||
set +e
|
||||
}
|
||||
|
||||
# Undo forwarding_setup: hand $ns3 to cleanup_ns, then run the base
# route_cleanup.
forwarding_cleanup() {
	cleanup_ns $ns3
	route_cleanup
}
|
||||
|
||||
# Extend route_setup with an ns3 that is reachable through ns2 over both
# devices.
forwarding_setup() {
	# start from a clean state before building the topology
	forwarding_cleanup
	route_setup
	setup_ns ns3

	# veth pair linking ns3 (veth5) to ns2 (veth6)
	ip link add veth5 netns $ns3 type veth peer name veth6 netns $ns2
	ip -netns $ns3 link set veth5 up
	ip -netns $ns2 link set veth6 up

	# IPv4: address both ends, then route 172.16.100.0/22 from ns3 via ns2
	ip -netns $ns3 -4 addr add dev veth5 172.16.105.1/24
	ip -netns $ns2 -4 addr add dev veth6 172.16.105.2/24
	ip -netns $ns3 -4 route add 172.16.100.0/22 via 172.16.105.2

	# IPv6: same layout, with a return route for 2001:db8:101::/33
	ip -netns $ns3 -6 addr add dev veth5 2001:db8:105::1/64 nodad
	ip -netns $ns2 -6 addr add dev veth6 2001:db8:105::2/64 nodad
	ip -netns $ns3 -6 route add 2001:db8:101::/33 via 2001:db8:105::2
}
|
||||
|
||||
# assumption is that basic add of a single path route works
|
||||
# otherwise just adding an address on an interface is broken
|
||||
ipv6_rt_add()
|
||||
@@ -2600,6 +2629,93 @@ ipv6_mpath_list_test()
|
||||
route_cleanup
|
||||
}
|
||||
|
||||
# Install a flower counter (via tc_set_flower_counter) that matches TCP
# packets with tcp_flags 0x2 and a given source address.
#   $1: network namespace
#   $2: IP version (4 or 6)
#   $3: device
#   $4: source address to match
tc_set_flower_counter__saddr_syn() {
	# deliberately not readonly: the callee declares its own readonly locals
	local syn_match="src_ip $4 ip_proto tcp tcp_flags 0x2"

	tc_set_flower_counter $1 $2 $3 "$syn_match"
}
|
||||
|
||||
# Check that the external tools used by the multipath balance tests (socat
# and jq) resolve to an executable. Prints a message and returns 1 for the
# first tool that is missing; returns 0 when both are found.
ip_mpath_balance_dep_check() {
	[ -x "$(command -v socat)" ] || {
		echo "socat command not found. Skipping test"
		return 1
	}

	[ -x "$(command -v jq)" ] || {
		echo "jq command not found. Skipping test"
		return 1
	}
}
|
||||
|
||||
# Open a series of TCP connections from ns1 to a listener in ns3, then read
# the per-device SYN counters on veth1 and veth3 in ns1.
#   $1: address-family option passed to socat (-4 or -6)
#   $2: destination address, as written inside socat's TCP:<addr>:<port>
# Succeeds only if the two counters together saw at least one SYN per
# connection and each device saw at least one.
ip_mpath_balance() {
	local -r family=$1
	local -r target=$2
	local -r conn_count=20

	for i in $(seq 1 $conn_count); do
		# fresh listener for every connection; the short sleep presumably
		# gives it time to bind before the client connects
		ip netns exec $ns3 socat $family TCP-LISTEN:8000 STDIO >/dev/null &
		sleep 0.02
		echo -n a | ip netns exec $ns1 socat $family STDIO TCP:$target:8000
	done

	local -r syns_veth1="$(tc_get_flower_counter $ns1 veth1)"
	local -r syns_veth3="$(tc_get_flower_counter $ns1 veth3)"
	local -r syns_total=$((syns_veth1+syns_veth3))

	if [ "$VERBOSE" = "1" ]; then
		echo "multipath: syns seen: ($syns_veth1,$syns_veth3)"
	fi

	# the status of this test is the function's return value
	[[ $syns_total -ge $conn_count && $syns_veth1 -gt 0 && $syns_veth3 -gt 0 ]]
}
|
||||
|
||||
# IPv4 multipath balance test: add a two-nexthop route to 172.16.105.1,
# enable multipath hash policy 1 in ns1, and check via ip_mpath_balance that
# SYNs are counted on both veth1 and veth3.
# Returns 1 without running anything if socat or jq is missing.
ipv4_mpath_balance_test() {
	echo
	echo "IPv4 multipath load balance test"

	if ! ip_mpath_balance_dep_check; then
		return 1
	fi

	forwarding_setup

	$IP route add 172.16.105.1 \
		nexthop via 172.16.101.2 \
		nexthop via 172.16.103.2

	ip netns exec $ns1 sysctl -q -w net.ipv4.fib_multipath_hash_policy=1

	# one SYN counter per egress device, each keyed on a source address
	tc_set_flower_counter__saddr_syn $ns1 4 veth1 172.16.101.1
	tc_set_flower_counter__saddr_syn $ns1 4 veth3 172.16.103.1

	# log_test must directly follow so that $? is ip_mpath_balance's status
	ip_mpath_balance -4 172.16.105.1
	log_test $? 0 "IPv4 multipath loadbalance"

	forwarding_cleanup
}
|
||||
|
||||
# IPv6 multipath balance test: add a two-nexthop route to 2001:db8:105::1,
# enable multipath hash policy 1 in ns1, and check via ip_mpath_balance that
# SYNs are counted on both veth1 and veth3.
# Returns 1 without running anything if socat or jq is missing.
ipv6_mpath_balance_test() {
	echo
	echo "IPv6 multipath load balance test"

	if ! ip_mpath_balance_dep_check; then
		return 1
	fi

	forwarding_setup

	$IP route add 2001:db8:105::1 \
		nexthop via 2001:db8:101::2 \
		nexthop via 2001:db8:103::2

	ip netns exec $ns1 sysctl -q -w net.ipv6.fib_multipath_hash_policy=1

	# one SYN counter per egress device, each keyed on a source address
	tc_set_flower_counter__saddr_syn $ns1 6 veth1 2001:db8:101::1
	tc_set_flower_counter__saddr_syn $ns1 6 veth3 2001:db8:103::1

	# socat needs the IPv6 literal in brackets to separate it from the port
	ip_mpath_balance -6 "[2001:db8:105::1]"
	log_test $? 0 "IPv6 multipath loadbalance"

	forwarding_cleanup
}
|
||||
|
||||
################################################################################
|
||||
# usage
|
||||
|
||||
@@ -2683,6 +2799,8 @@ do
|
||||
fib6_gc_test|ipv6_gc) fib6_gc_test;;
|
||||
ipv4_mpath_list) ipv4_mpath_list_test;;
|
||||
ipv6_mpath_list) ipv6_mpath_list_test;;
|
||||
ipv4_mpath_balance) ipv4_mpath_balance_test;;
|
||||
ipv6_mpath_balance) ipv6_mpath_balance_test;;
|
||||
|
||||
help) echo "Test names: $TESTS"; exit 0;;
|
||||
esac
|
||||
|
||||
@@ -270,6 +270,30 @@ tc_rule_handle_stats_get()
|
||||
.options.actions[0].stats$selector"
|
||||
}
|
||||
|
||||
# Attach a prio qdisc with two pfifo children (no-match / match) to a device,
# plus a flower filter that classifies matching packets into the second child
# (classid 1:2, handle 12:).
#   $1: network namespace
#   $2: IP version; appended to "ipv" to form the filter protocol
#   $3: device
#   $4: flower match expression
tc_set_flower_counter() {
	local -r netns=$1
	local -r ip_version=$2
	local -r ifname=$3
	local -r match_expr=$4

	# the all-zero priomap sends every packet to the first band by default
	tc -n $netns qdisc add dev $ifname root handle 1: prio bands 2 \
		priomap 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

	tc -n $netns qdisc add dev $ifname parent 1:1 handle 11: pfifo
	tc -n $netns qdisc add dev $ifname parent 1:2 handle 12: pfifo

	# $match_expr is left unquoted on purpose: it must split into
	# separate flower arguments
	tc -n $netns filter add dev $ifname parent 1: protocol ipv$ip_version \
		flower $match_expr classid 1:2
}
|
||||
|
||||
# Print the packet count of the qdisc with handle 12: on a device, i.e. the
# child that tc_set_flower_counter uses for packets matching its filter.
#   $1: network namespace
#   $2: device
tc_get_flower_counter() {
	local -r netns=$1
	local -r ifname=$2

	# The jq filter is quoted: unquoted, ".[0].packets" is a glob pattern
	# ("[0]" is a bracket expression) and could be rewritten by pathname
	# expansion, or rejected under failglob/nullglob.
	tc -n "$netns" -j -s qdisc show dev "$ifname" handle 12: | jq '.[0].packets'
}
|
||||
|
||||
ret_set_ksft_status()
|
||||
{
|
||||
local ksft_status=$1; shift
|
||||
|
||||
Reference in New Issue
Block a user