From 0cd33c5ffec12bd77a1c02db2469fac08f840939 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:38 -0800 Subject: [PATCH 1/7] selftests: mptcp: fix diag instability Instead of waiting for an arbitrary amount of time for the MPTCP MP_CAPABLE handshake to complete, explicitly wait for the relevant socket to enter into the established status. Additionally let the data transfer application use the slowest transfer mode available (-r), to cope with very slow host, or high jitter caused by hosting VMs. Fixes: df62f2ec3df6 ("selftests/mptcp: add diag interface tests") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/258 Reported-and-tested-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/diag.sh | 44 +++++++++++++++++++---- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 2674ba20d524..ff821025d309 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -71,6 +71,36 @@ chk_msk_remote_key_nr() __chk_nr "grep -c remote_key" $* } +# $1: ns, $2: port +wait_local_port_listen() +{ + local listener_ns="${1}" + local port="${2}" + + local port_hex i + + port_hex="$(printf "%04X" "${port}")" + for i in $(seq 10); do + ip netns exec "${listener_ns}" cat /proc/net/tcp | \ + awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" && + break + sleep 0.1 + done +} + +wait_connected() +{ + local listener_ns="${1}" + local port="${2}" + + local port_hex i + + port_hex="$(printf "%04X" "${port}")" + for i in $(seq 10); do + ip netns exec ${listener_ns} grep -q " 0100007F:${port_hex} " /proc/net/tcp && break + sleep 0.1 + done +} trap cleanup EXIT ip netns add $ns @@ -81,15 +111,15 @@ echo "a" | \ ip netns exec $ns \ ./mptcp_connect -p 10000 -l -t ${timeout_poll} \ 0.0.0.0 >/dev/null & -sleep 0.1 +wait_local_port_listen $ns 10000 chk_msk_nr 0 "no msk on netns creation" echo "b" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p 10000 -j -t ${timeout_poll} \ + ./mptcp_connect -p 10000 -r 0 -t ${timeout_poll} \ 127.0.0.1 >/dev/null & -sleep 0.1 +wait_connected $ns 10000 chk_msk_nr 2 "after MPC handshake " chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" @@ -101,13 +131,13 @@ echo "a" | \ ip netns exec $ns \ ./mptcp_connect -p 10001 -l -s TCP -t ${timeout_poll} \ 0.0.0.0 >/dev/null & -sleep 0.1 +wait_local_port_listen $ns 10001 echo "b" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p 10001 -j -t ${timeout_poll} \ + ./mptcp_connect -p 10001 -r 0 -t ${timeout_poll} \ 127.0.0.1 >/dev/null & -sleep 0.1 +wait_connected $ns 10001 chk_msk_fallback_nr 1 "check fallback" flush_pids @@ -119,7 +149,7 @@ for I in `seq 1 $NR_CLIENTS`; do ./mptcp_connect -p $((I+10001)) -l -w 10 \ -t ${timeout_poll} 0.0.0.0 >/dev/null & done -sleep 0.1 +wait_local_port_listen $ns $((NR_CLIENTS + 10001)) for I in `seq 1 $NR_CLIENTS`; do echo "b" | \ From 5b31dda736e31c58d1941c7349569c7452eafb6b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:39 -0800 Subject: [PATCH 2/7] selftests: mptcp: improve 'fair usage on close' stability The mentioned test has to wait for a subflow creation failure. The current code looks for TCP sockets in TW state and sometimes misses the relevant event. Switch to a more stable check, looking for the associated mib counter. Fixes: 46e967d187ed ("selftests: mptcp: add tests for subflow creation failure") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/257 Reported-and-tested-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index c0801df15f54..10b3bd805ac6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -961,7 +961,7 @@ wait_for_tw() local ns=$1 while [ $time -lt $timeout_ms ]; do - local cnt=$(ip netns exec $ns ss -t state time-wait |wc -l) + local cnt=$(ip netns exec $ns nstat -as TcpAttemptFails | grep TcpAttemptFails | awk '{print $2}') [ "$cnt" = 1 ] && return 1 time=$((time + 100)) From 98247bc16a27cf8ead4c47ce9f15888be85841fc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:40 -0800 Subject: [PATCH 3/7] mptcp: fix race in overlapping signal events After commit a88c9e496937 ("mptcp: do not block subflows creation on errors"), if a signal address races with a failing subflow creation, the subflow creation failure control path can trigger the selection of the next address to be announced while the current announced is still pending. The above will cause the unintended suppression of the ADD_ADDR announce. Fix the issue skipping the to-be-suppressed announce before it will mark an endpoint as already used. The relevant announce will be triggered again when the current one will complete. Fixes: a88c9e496937 ("mptcp: do not block subflows creation on errors") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 356f596e2032..82f82a513f5b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -546,6 +546,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (msk->pm.add_addr_signaled < add_addr_signal_max) { local = select_signal_address(pernet, msk); + /* due to racing events on both ends we can reach here while + * previous add address is still running: if we invoke now + * mptcp_pm_announce_addr(), that will fail and the + * corresponding id will be marked as used. + * Instead let the PM machinery reschedule us when the + * current address announce will be completed. + */ + if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) + return; + if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); From 837cf45df163a3780bc04b555700231e95b31dc9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:41 -0800 Subject: [PATCH 4/7] mptcp: fix race in incoming ADD_ADDR option processing If an MPTCP endpoint received multiple consecutive incoming ADD_ADDR options, mptcp_pm_add_addr_received() can overwrite the current remote address value after the PM lock is released in mptcp_pm_nl_add_addr_received() and before such address is echoed. Fix the issue caching the remote address value a little earlier and always using the cached value after releasing the PM lock. Fixes: f7efc7771eac ("mptcp: drop argument port from mptcp_pm_announce_addr") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 82f82a513f5b..4b5d795383cd 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -660,6 +660,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) unsigned int add_addr_accept_max; struct mptcp_addr_info remote; unsigned int subflows_max; + bool reset_port = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); @@ -669,15 +670,19 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); - if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote)) + remote = msk->pm.remote; + if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) goto add_addr_echo; + /* pick id 0 port, if none is provided the remote address */ + if (!remote.port) { + reset_port = true; + remote.port = sk->sk_dport; + } + /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ - remote = msk->pm.remote; - if (!remote.port) - remote.port = sk->sk_dport; nr = fill_local_addresses_vec(msk, addrs); msk->pm.add_addr_accepted++; @@ -690,8 +695,12 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) __mptcp_subflow_connect(sk, &addrs[i], &remote); spin_lock_bh(&msk->pm.lock); + /* be sure to echo exactly the received address */ + if (reset_port) + remote.port = 0; + add_addr_echo: - mptcp_pm_announce_addr(msk, &msk->pm.remote, true); + mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_nl_addr_send_ack(msk); } From f73c1194634506ab60af0debef04671fc431a435 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:42 -0800 Subject: [PATCH 5/7] mptcp: add mibs counter for ignored incoming options The MPTCP in kernel path manager has some constraints on incoming addresses announce processing, so that in edge scenarios it can end-up dropping (ignoring) some of such announces. The above is not very limiting in practice since such scenarios are very uncommon and MPTCP will recover due to ADD_ADDR retransmissions. This patch adds a few MIB counters to account for such drop events to allow easier introspection of the critical scenarios. Fixes: f7efc7771eac ("mptcp: drop argument port from mptcp_pm_announce_addr") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/mib.c | 2 ++ net/mptcp/mib.h | 2 ++ net/mptcp/pm.c | 8 ++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 3240b72271a7..7558802a1435 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -35,12 +35,14 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), + SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), + SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index ecd3d8b117e0..2966fcb6548b 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -28,12 +28,14 @@ enum linux_mptcp_mib_field { MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */ + MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */ MPTCP_MIB_RMADDR, /* Received RM_ADDR */ + MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 696b2c4613a7..7bea318ac5f2 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -213,6 +213,8 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; + } else { + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } spin_unlock_bh(&pm->lock); @@ -253,8 +255,10 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); - mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED); - pm->rm_list_rx = *rm_list; + if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED)) + pm->rm_list_rx = *rm_list; + else + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP); spin_unlock_bh(&pm->lock); } From 6ef84b1517e08f6c2fc105b798a9d21bf4caa6cb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:43 -0800 Subject: [PATCH 6/7] selftests: mptcp: more robust signal race test The in kernel MPTCP PM implementation can process a single incoming add address option at any given time. In the mentioned test the server can surpass such limit. Let the setup cope with that allowing a faster add_addr retransmission. Fixes: a88c9e496937 ("mptcp: do not block subflows creation on errors") Fixes: f7efc7771eac ("mptcp: drop argument port from mptcp_pm_announce_addr") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/254 Reported-and-tested-by: Matthieu Baerts Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 10b3bd805ac6..0d6a71e7bb59 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -752,11 +752,17 @@ chk_add_nr() local mis_ack_nr=${8:-0} local count local dump_stats + local timeout + + timeout=`ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout` printf "%-39s %s" " " "add" - count=`ip netns exec $ns2 nstat -as | grep MPTcpExtAddAddr | awk '{print $2}'` + count=`ip netns exec $ns2 nstat -as MPTcpExtAddAddr | grep MPTcpExtAddAddr | awk '{print $2}'` [ -z "$count" ] && count=0 - if [ "$count" != "$add_nr" ]; then + + # if the test configured a short timeout tolerate greater then expected + # add addrs options, due to retransmissions + if [ "$count" != "$add_nr" ] && [ "$timeout" -gt 1 -o "$count" -lt "$add_nr" ]; then echo "[fail] got $count ADD_ADDR[s] expected $add_nr" ret=1 dump_stats=1 @@ -1158,7 +1164,10 @@ signal_address_tests() ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags signal ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags signal ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags signal - run_tests $ns1 $ns2 10.0.1.1 + + # the peer could possibly miss some addr notification, allow retransmission + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1 + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow chk_join_nr "signal addresses race test" 3 3 3 # the server will not signal the address terminating From e35f885b357d47e04380a2056d1b2cc3e6f4f24b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:44 -0800 Subject: [PATCH 7/7] selftests: mptcp: be more conservative with cookie MPJ limits Since commit 2843ff6f36db ("mptcp: remote addresses fullmesh"), an MPTCP client can attempt creating multiple MPJ subflow simultaneusly. In such scenario the server, when syncookies are enabled, could end-up accepting incoming MPJ syn even above the configured subflow limit, as the such limit can be enforced in a reliable way only after the subflow creation. In case of syncookie, only after the 3rd ack reception. As a consequence the related self-tests case sporadically fails, as it verify that the server always accept the expected number of MPJ syn. Address the issues relaxing the MPJ syn number constrain. Note that the check on the accepted number of MPJ 3rd ack still remains intact. Fixes: 2843ff6f36db ("mptcp: remote addresses fullmesh") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 0d6a71e7bb59..0c8a2a20b96c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -660,6 +660,7 @@ chk_join_nr() local ack_nr=$4 local count local dump_stats + local with_cookie printf "%02u %-36s %s" "$TEST_COUNT" "$msg" "syn" count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinSynRx | awk '{print $2}'` @@ -673,12 +674,20 @@ chk_join_nr() fi echo -n " - synack" + with_cookie=`ip netns exec $ns2 sysctl -n net.ipv4.tcp_syncookies` count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPJoinSynAckRx | awk '{print $2}'` [ -z "$count" ] && count=0 if [ "$count" != "$syn_ack_nr" ]; then - echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr" - ret=1 - dump_stats=1 + # simult connections exceeding the limit with cookie enabled could go up to + # synack validation as the conn limit can be enforced reliably only after + # the subflow creation + if [ "$with_cookie" = 2 ] && [ "$count" -gt "$syn_ack_nr" ] && [ "$count" -le "$syn_nr" ]; then + echo -n "[ ok ]" + else + echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr" + ret=1 + dump_stats=1 + fi else echo -n "[ ok ]" fi