From 0eb4e7ee1655b7ffd3204a35d77b809d42613cb9 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 7 Mar 2022 12:44:31 -0800 Subject: [PATCH 1/9] mptcp: add tracepoint in mptcp_sendmsg_frag The tracepoint in get_mapping_status() only dumped the incoming mpext fields. This patch added a new tracepoint in mptcp_sendmsg_frag() to dump the outgoing mpext too. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/trace/events/mptcp.h | 4 ++++ net/mptcp/protocol.c | 1 + 2 files changed, 5 insertions(+) diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h index 6bf43176f14c..f8e28e686c65 100644 --- a/include/trace/events/mptcp.h +++ b/include/trace/events/mptcp.h @@ -115,6 +115,10 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext, __entry->csum_reqd) ); +DEFINE_EVENT(mptcp_dump_mpext, mptcp_sendmsg_frag, + TP_PROTO(struct mptcp_ext *mpext), + TP_ARGS(mpext)); + DEFINE_EVENT(mptcp_dump_mpext, get_mapping_status, TP_PROTO(struct mptcp_ext *mpext), TP_ARGS(mpext)); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1c72f25f083e..36a7d33f670a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1356,6 +1356,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); + trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; } From ea56dcb43c2054426c5a8d7befa8d993a060b26b Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 7 Mar 2022 12:44:32 -0800 Subject: [PATCH 2/9] mptcp: use MPTCP_SUBFLOW_NODATA Set subflow->data_avail with the enum value MPTCP_SUBFLOW_NODATA, instead of using 0 directly. Reviewed-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 45c004f87f5a..bb09a008e733 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1104,7 +1104,7 @@ static bool subflow_check_data_avail(struct sock *ssk) struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); if (subflow->data_avail) return true; @@ -1169,7 +1169,7 @@ static bool subflow_check_data_avail(struct sock *ssk) subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return true; } @@ -1182,7 +1182,7 @@ static bool subflow_check_data_avail(struct sock *ssk) subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return false; } @@ -1204,7 +1204,7 @@ bool mptcp_subflow_data_available(struct sock *sk) if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); pr_debug("Done with mapping: seq=%u data_len=%u", subflow->map_subflow_seq, From 826d7bdca83328b101853b48ee6b5e9bb6a5f537 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Mon, 7 Mar 2022 12:44:33 -0800 Subject: [PATCH 3/9] selftests: mptcp: join: allow running -cCi Without this patch, no tests would be ran when launching: mptcp_join.sh -cCi In any order or a combination with 2 of these letters. The recommended way with getopt is first parse all options and then act. This allows to do some actions in priority, e.g. display the help menu and stop. But also some global variables changing the behaviour of this selftests -- like the ones behind -cCi options -- can be set before running the different tests. By doing that, we can also avoid long and unreadable regex. Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 67 ++++++++----------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 45c6e5f06916..309d06781ae7 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -16,7 +16,6 @@ capture=0 checksum=0 ip_mptcp=0 check_invert=0 -do_all_tests=1 init=0 TEST_COUNT=0 @@ -2293,84 +2292,66 @@ usage() exit ${ret} } -for arg in "$@"; do - # check for "capture/checksum" args before launching tests - if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then - capture=1 - fi - if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"C"[0-9a-zA-Z]*$ ]]; then - checksum=1 - fi - if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"i"[0-9a-zA-Z]*$ ]]; then - ip_mptcp=1 - fi - - # exception for the capture/checksum/ip_mptcp options, the rest means: a part of the tests - if [ "${arg}" != "-c" ] && [ "${arg}" != "-C" ] && [ "${arg}" != "-i" ]; then - do_all_tests=0 - fi -done - -if [ $do_all_tests -eq 1 ]; then - all_tests - exit $ret -fi +tests=() while getopts 'fesltra64bpkdmchzCSi' opt; do case $opt in f) - subflows_tests + tests+=(subflows_tests) ;; e) - subflows_error_tests + tests+=(subflows_error_tests) ;; s) - signal_address_tests + tests+=(signal_address_tests) ;; l) - link_failure_tests + tests+=(link_failure_tests) ;; t) - add_addr_timeout_tests + tests+=(add_addr_timeout_tests) ;; r) - remove_tests + tests+=(remove_tests) ;; a) - add_tests + tests+=(add_tests) ;; 6) - ipv6_tests + tests+=(ipv6_tests) ;; 4) - v4mapped_tests + tests+=(v4mapped_tests) ;; b) - backup_tests + tests+=(backup_tests) ;; p) - add_addr_ports_tests + tests+=(add_addr_ports_tests) ;; k) - syncookies_tests + tests+=(syncookies_tests) ;; S) - checksum_tests + tests+=(checksum_tests) ;; d) - deny_join_id0_tests + tests+=(deny_join_id0_tests) ;; m) - fullmesh_tests + tests+=(fullmesh_tests) ;; z) - fastclose_tests + tests+=(fastclose_tests) ;; c) + capture=1 ;; C) + checksum=1 ;; i) + ip_mptcp=1 ;; h) usage @@ -2381,4 +2362,12 @@ while getopts 'fesltra64bpkdmchzCSi' opt; do esac done +if [ ${#tests[@]} -eq 0 ]; then + all_tests +else + for subtests in "${tests[@]}"; do + "${subtests}" + done +fi + exit $ret From f98c2bca7b2bd3fd777e05bb9e94c969381b1de8 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Mon, 7 Mar 2022 12:44:34 -0800 Subject: [PATCH 4/9] selftests: mptcp: Rename wait function The "selftests: mptcp: improve 'fair usage on close' stability" commit changed that self test to check the TcpAttemptFails MIB instead of looking for TW sockets. The associated bash function wasn't renamed in that commit because of the merge conflicts it would cause, so this commit updates the function name as Paolo originally intended. Cc: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 309d06781ae7..d4769bc0d842 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1242,7 +1242,7 @@ chk_link_usage() fi } -wait_for_tw() +wait_attempt_fail() { local timeout_ms=$((timeout_poll * 1000)) local time=0 @@ -1361,7 +1361,7 @@ subflows_error_tests() TEST_COUNT=$((TEST_COUNT+1)) # mpj subflow will be in TW after the reset - wait_for_tw $ns2 + wait_attempt_fail $ns2 pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow wait From 6fa0174a7c8646fce7039b5a176b4f90b0ea513a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Mar 2022 12:44:35 -0800 Subject: [PATCH 5/9] mptcp: more careful RM_ADDR generation The in-kernel MPTCP path manager, when processing the MPTCP_PM_CMD_FLUSH_ADDR command, generates RM_ADDR events for each known local address. While that is allowed by the RFC, it makes unpredictable the exact number of RM_ADDR generated when both ends flush the PM addresses. This change restricts the RM_ADDR generation to previously explicitly announced addresses, and adjust the expected results in a bunch of related self-tests. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 10 ++--- .../testing/selftests/net/mptcp/mptcp_join.sh | 42 ++++++++++++++++--- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 75a0a27547e6..91b77d1162cf 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1466,14 +1466,12 @@ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, list_for_each_entry(entry, rm_list, list) { if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - alist.nr < MPTCP_RM_IDS_MAX && - slist.nr < MPTCP_RM_IDS_MAX) { - alist.ids[alist.nr++] = entry->addr.id; + slist.nr < MPTCP_RM_IDS_MAX) slist.ids[slist.nr++] = entry->addr.id; - } else if (remove_anno_list_by_saddr(msk, &entry->addr) && - alist.nr < MPTCP_RM_IDS_MAX) { + + if (remove_anno_list_by_saddr(msk, &entry->addr) && + alist.nr < MPTCP_RM_IDS_MAX) alist.ids[alist.nr++] = entry->addr.id; - } } if (alist.nr) { diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index d4769bc0d842..02bab8a2d5a5 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1149,14 +1149,25 @@ chk_rm_nr() { local rm_addr_nr=$1 local rm_subflow_nr=$2 - local invert=${3:-""} + local invert + local simult local count local dump_stats local addr_ns=$ns1 local subflow_ns=$ns2 local extra_msg="" - if [[ $invert = "invert" ]]; then + shift 2 + while [ -n "$1" ]; do + [ "$1" = "invert" ] && invert=true + [ "$1" = "simult" ] && simult=true + shift + done + + if [ -z $invert ]; then + addr_ns=$ns1 + subflow_ns=$ns2 + elif [ $invert = "true" ]; then addr_ns=$ns2 subflow_ns=$ns1 extra_msg=" invert" @@ -1176,6 +1187,25 @@ chk_rm_nr() echo -n " - rmsf " count=`ip netns exec $subflow_ns nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'` [ -z "$count" ] && count=0 + if [ -n "$simult" ]; then + local cnt=$(ip netns exec $addr_ns nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}') + local suffix + + # in case of simult flush, the subflow removal count on each side is + # unreliable + [ -z "$cnt" ] && cnt=0 + count=$((count + cnt)) + [ "$count" != "$rm_subflow_nr" ] && suffix="$count in [$rm_subflow_nr:$((rm_subflow_nr*2))]" + if [ $count -ge "$rm_subflow_nr" ] && \ + [ "$count" -le "$((rm_subflow_nr *2 ))" ]; then + echo "[ ok ] $suffix" + else + echo "[fail] got $count RM_SUBFLOW[s] expected in range [$rm_subflow_nr:$((rm_subflow_nr*2))]" + ret=1 + dump_stats=1 + fi + return + fi if [ "$count" != "$rm_subflow_nr" ]; then echo "[fail] got $count RM_SUBFLOW[s] expected $rm_subflow_nr" ret=1 @@ -1666,7 +1696,7 @@ remove_tests() run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow chk_join_nr "flush subflows and signal" 3 3 3 chk_add_nr 1 1 - chk_rm_nr 2 2 + chk_rm_nr 1 3 invert simult # subflows flush reset @@ -1677,7 +1707,7 @@ remove_tests() pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow chk_join_nr "flush subflows" 3 3 3 - chk_rm_nr 3 3 + chk_rm_nr 0 3 simult # addresses flush reset @@ -1689,7 +1719,7 @@ remove_tests() run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow chk_join_nr "flush addresses" 3 3 3 chk_add_nr 3 3 - chk_rm_nr 3 3 invert + chk_rm_nr 3 3 invert simult # invalid addresses flush reset @@ -1973,7 +2003,7 @@ add_addr_ports_tests() run_tests $ns1 $ns2 10.0.1.1 0 -8 -2 slow chk_join_nr "flush subflows and signal with port" 3 3 3 chk_add_nr 1 1 - chk_rm_nr 2 2 + chk_rm_nr 1 3 invert simult # multiple addresses with port reset From d045b9eb95a9b611c483897a69e7285aefdc66d7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Mar 2022 12:44:36 -0800 Subject: [PATCH 6/9] mptcp: introduce implicit endpoints In some edge scenarios, an MPTCP subflows can use a local address mapped by a "implicit" endpoint created by the in-kernel path manager. Such endpoints presence can be confusing, as it's creation is hard to track and will prevent the later endpoint creation from the user-space using the same address. Define a new endpoint flag to mark implicit endpoints and allow the user-space to replace implicit them with user-provided data at endpoint creation time. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm_netlink.c | 61 +++++++++++++------ .../testing/selftests/net/mptcp/mptcp_join.sh | 4 +- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index f106a3941cdf..9690efedb5fa 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -81,6 +81,7 @@ enum { #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) #define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) +#define MPTCP_PM_ADDR_FLAG_IMPLICIT (1 << 4) enum { MPTCP_PM_CMD_UNSPEC, diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 91b77d1162cf..10368a4f1c4a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -877,10 +877,18 @@ static bool address_use_port(struct mptcp_pm_addr_entry *entry) MPTCP_PM_ADDR_FLAG_SIGNAL; } +/* caller must ensure the RCU grace period is already elapsed */ +static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) +{ + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); +} + static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry) { - struct mptcp_pm_addr_entry *cur; + struct mptcp_pm_addr_entry *cur, *del_entry = NULL; unsigned int addr_max; int ret = -EINVAL; @@ -901,8 +909,22 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, list_for_each_entry(cur, &pernet->local_addr_list, list) { if (addresses_equal(&cur->addr, &entry->addr, address_use_port(entry) && - address_use_port(cur))) - goto out; + address_use_port(cur))) { + /* allow replacing the exiting endpoint only if such + * endpoint is an implicit one and the user-space + * did not provide an endpoint id + */ + if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) + goto out; + if (entry->addr.id) + goto out; + + pernet->addrs--; + entry->addr.id = cur->addr.id; + list_del_rcu(&cur->list); + del_entry = cur; + break; + } } if (!entry->addr.id) { @@ -938,6 +960,12 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, out: spin_unlock_bh(&pernet->lock); + + /* just replaced an existing entry, free it */ + if (del_entry) { + synchronize_rcu(); + __mptcp_pm_release_addr_entry(del_entry); + } return ret; } @@ -1036,7 +1064,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) entry->addr.id = 0; entry->addr.port = 0; entry->ifindex = 0; - entry->flags = 0; + entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) @@ -1249,6 +1277,11 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { + GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); + return -EINVAL; + } + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { GENL_SET_ERR_MSG(info, "can't allocate addr"); @@ -1333,11 +1366,12 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, } static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, - struct mptcp_addr_info *addr) + const struct mptcp_pm_addr_entry *entry) { - struct mptcp_sock *msk; - long s_slot = 0, s_num = 0; + const struct mptcp_addr_info *addr = &entry->addr; struct mptcp_rm_list list = { .nr = 0 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; pr_debug("remove_id=%d", addr->id); @@ -1354,7 +1388,8 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, lock_sock(sk); remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); - mptcp_pm_remove_anno_addr(msk, addr, remove_subflow); + mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && + !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); if (remove_subflow) mptcp_pm_remove_subflow(msk, &list); release_sock(sk); @@ -1367,14 +1402,6 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, return 0; } -/* caller must ensure the RCU grace period is already elapsed */ -static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) -{ - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); -} - static int mptcp_nl_remove_id_zero_address(struct net *net, struct mptcp_addr_info *addr) { @@ -1451,7 +1478,7 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); - mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr); + mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); synchronize_rcu(); __mptcp_pm_release_addr_entry(entry); diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 02bab8a2d5a5..1e2e8dd9f0d6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1938,7 +1938,7 @@ backup_tests() run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup chk_join_nr "single address, backup" 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 0 + chk_prio_nr 1 1 # single address with port, backup reset @@ -1948,7 +1948,7 @@ backup_tests() run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup chk_join_nr "single address with port, backup" 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 0 + chk_prio_nr 1 1 } add_addr_ports_tests() From 4cf86ae84c718333928fd2d43168a1e359a28329 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Mar 2022 12:44:37 -0800 Subject: [PATCH 7/9] mptcp: strict local address ID selection The address ID selection for MPJ subflows created in response to incoming ADD_ADDR option is currently unreliable: it happens at MPJ socket creation time, when the local address could be unknown. Additionally, if the no local endpoint is available for the local address, a new dummy endpoint is created, confusing the user-land. This change refactor the code to move the address ID selection inside the rebuild_header() helper, when the local address eventually selected by the route lookup is finally known. If the address used is not mapped by any endpoint - and thus can't be advertised/removed pick the id 0 instead of allocate a new endpoint. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 13 -------- net/mptcp/protocol.c | 3 ++ net/mptcp/protocol.h | 3 +- net/mptcp/subflow.c | 67 ++++++++++++++++++++++++++++++++++++------ 4 files changed, 63 insertions(+), 23 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 10368a4f1c4a..e090810bb35d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -83,16 +83,6 @@ static bool addresses_equal(const struct mptcp_addr_info *a, return a->port == b->port; } -static bool address_zero(const struct mptcp_addr_info *addr) -{ - struct mptcp_addr_info zero; - - memset(&zero, 0, sizeof(zero)); - zero.family = addr->family; - - return addresses_equal(addr, &zero, true); -} - static void local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { @@ -1039,9 +1029,6 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) if (addresses_equal(&msk_local, &skc_local, false)) return 0; - if (address_zero(&skc_local)) - return 0; - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); rcu_read_lock(); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 36a7d33f670a..101aeebeb9eb 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -117,6 +117,9 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; + + /* This is the first subflow, always with id 0 */ + subflow->local_id_valid = 1; mptcp_sock_graft(msk->first, sk->sk_socket); return 0; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 9d0ee6cee07f..3c1a3036550f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -442,7 +442,8 @@ struct mptcp_subflow_context { rx_eof : 1, can_ack : 1, /* only after processing the remote a key */ disposable : 1, /* ctx can be free at ulp release time */ - stale : 1; /* unable to snd/rcv data, do not use for xmit */ + stale : 1, /* unable to snd/rcv data, do not use for xmit */ + local_id_valid : 1; /* local_id is correctly initialized */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bb09a008e733..aba260f547da 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -481,6 +481,51 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) mptcp_subflow_reset(sk); } +static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) +{ + subflow->local_id = local_id; + subflow->local_id_valid = 1; +} + +static int subflow_chk_local_id(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + int err; + + if (likely(subflow->local_id_valid)) + return 0; + + err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); + if (err < 0) + return err; + + subflow_set_local_id(subflow, err); + return 0; +} + +static int subflow_rebuild_header(struct sock *sk) +{ + int err = subflow_chk_local_id(sk); + + if (unlikely(err < 0)) + return err; + + return inet_sk_rebuild_header(sk); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int subflow_v6_rebuild_header(struct sock *sk) +{ + int err = subflow_chk_local_id(sk); + + if (unlikely(err < 0)) + return err; + + return inet6_sk_rebuild_header(sk); +} +#endif + struct request_sock_ops mptcp_subflow_request_sock_ops; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; @@ -1398,13 +1443,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); - if (!local_id) { - err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk); - if (err < 0) - goto failed; - - local_id = err; - } + if (local_id) + subflow_set_local_id(subflow, local_id); mptcp_pm_get_flags_and_ifindex_by_id(sock_net(sk), local_id, &flags, &ifindex); @@ -1429,7 +1469,6 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; - subflow->local_id = local_id; subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); @@ -1728,15 +1767,22 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->token = subflow_req->token; new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->idsn = subflow_req->idsn; + + /* this is the first subflow, id is always 0 */ + new_ctx->local_id_valid = 1; } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; new_ctx->fully_established = 1; new_ctx->backup = subflow_req->backup; - new_ctx->local_id = subflow_req->local_id; new_ctx->remote_id = subflow_req->remote_id; new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; + + /* the subflow req id is valid, fetched via subflow_check_req() + * and subflow_token_join_request() + */ + subflow_set_local_id(new_ctx, subflow_req->local_id); } } @@ -1789,6 +1835,7 @@ void __init mptcp_subflow_init(void) subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_specific.rebuild_header = subflow_rebuild_header; tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; @@ -1801,6 +1848,7 @@ void __init mptcp_subflow_init(void) subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; @@ -1808,6 +1856,7 @@ void __init mptcp_subflow_init(void) subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.net_frag_header_len = 0; + subflow_v6m_specific.rebuild_header = subflow_rebuild_header; tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; From 69c6ce7b6ecad8ed6c1b785bfadf50159d9f1023 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Mar 2022 12:44:38 -0800 Subject: [PATCH 8/9] selftests: mptcp: add implicit endpoint test case Ensure implicit endpoint are created when expected and that the user-space can update them Reviewed-by: Matthieu Baerts Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 120 +++++++++++++++++- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 7 + 2 files changed, 126 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 1e2e8dd9f0d6..ee435948d130 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -310,6 +310,21 @@ wait_rm_addr() done } +wait_mpj() +{ + local ns="${1}" + local cnt old_cnt + + old_cnt=$(ip netns exec ${ns} nstat -as | grep MPJoinAckRx | awk '{print $2}') + + local i + for i in $(seq 10); do + cnt=$(ip netns exec ${ns} nstat -as | grep MPJoinAckRx | awk '{print $2}') + [ "$cnt" = "${old_cnt}" ] || break + sleep 0.1 + done +} + pm_nl_set_limits() { local ns=$1 @@ -410,6 +425,80 @@ pm_nl_change_endpoint() fi } +pm_nl_check_endpoint() +{ + local line expected_line + local title="$1" + local msg="$2" + local ns=$3 + local addr=$4 + local _flags="" + local flags + local _port + local port + local dev + local _id + local id + + if [ -n "${title}" ]; then + printf "%03u %-36s %s" "${TEST_COUNT}" "${title}" "${msg}" + else + printf "%-${nr_blank}s %s" " " "${msg}" + fi + + shift 4 + while [ -n "$1" ]; do + if [ $1 = "flags" ]; then + _flags=$2 + [ ! -z $_flags ]; flags="flags $_flags" + shift + elif [ $1 = "dev" ]; then + [ ! -z $2 ]; dev="dev $1" + shift + elif [ $1 = "id" ]; then + _id=$2 + [ ! -z $_id ]; id="id $_id" + shift + elif [ $1 = "port" ]; then + _port=$2 + [ ! -z $_port ]; port=" port $_port" + shift + fi + + shift + done + + if [ -z "$id" ]; then + echo "[skip] bad test - missing endpoint id" + return + fi + + if [ $ip_mptcp -eq 1 ]; then + line=$(ip -n $ns mptcp endpoint show $id) + # the dump order is: address id flags port dev + expected_line="$addr" + [ -n "$addr" ] && expected_line="$expected_line $addr" + expected_line="$expected_line $id" + [ -n "$_flags" ] && expected_line="$expected_line ${_flags//","/" "}" + [ -n "$dev" ] && expected_line="$expected_line $dev" + [ -n "$port" ] && expected_line="$expected_line $port" + else + line=$(ip netns exec $ns ./pm_nl_ctl get $_id) + # the dump order is: id flags dev address port + expected_line="$id" + [ -n "$flags" ] && expected_line="$expected_line $flags" + [ -n "$dev" ] && expected_line="$expected_line $dev" + [ -n "$addr" ] && expected_line="$expected_line $addr" + [ -n "$_port" ] && expected_line="$expected_line $_port" + fi + if [ "$line" = "$expected_line" ]; then + echo "[ ok ]" + else + echo "[fail] expected '$expected_line' found '$line'" + ret=1 + fi +} + do_transfer() { listener_ns="$1" @@ -2269,6 +2358,30 @@ fastclose_tests() chk_rst_nr 1 1 invert } +implicit_tests() +{ + # userspace pm type prevents add_addr + reset + pm_nl_set_limits $ns1 2 2 + pm_nl_set_limits $ns2 2 2 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow & + + wait_mpj $ns1 + TEST_COUNT=$((TEST_COUNT + 1)) + pm_nl_check_endpoint "implicit EP" "creation" \ + $ns2 10.0.2.2 id 1 flags implicit + + pm_nl_add_endpoint $ns2 10.0.2.2 id 33 + pm_nl_check_endpoint "" "ID change is prevented" \ + $ns2 10.0.2.2 id 1 flags implicit + + pm_nl_add_endpoint $ns2 10.0.2.2 flags signal + pm_nl_check_endpoint "" "modif is allowed" \ + $ns2 10.0.2.2 id 1 flags signal + wait +} + all_tests() { subflows_tests @@ -2287,6 +2400,7 @@ all_tests() deny_join_id0_tests fullmesh_tests fastclose_tests + implicit_tests } # [$1: error message] @@ -2314,6 +2428,7 @@ usage() echo " -d deny_join_id0_tests" echo " -m fullmesh_tests" echo " -z fastclose_tests" + echo " -I implicit_tests" echo " -c capture pcap files" echo " -C enable data checksum" echo " -i use ip mptcp" @@ -2324,7 +2439,7 @@ usage() tests=() -while getopts 'fesltra64bpkdmchzCSi' opt; do +while getopts 'fesltra64bpkdmchzICSi' opt; do case $opt in f) tests+=(subflows_tests) @@ -2374,6 +2489,9 @@ while getopts 'fesltra64bpkdmchzCSi' opt; do z) tests+=(fastclose_tests) ;; + I) + tests+=(implicit_tests) + ;; c) capture=1 ;; diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 22a5ec1e128e..a75a68ad652e 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -436,6 +436,13 @@ static void print_addr(struct rtattr *attrs, int len) printf(","); } + if (flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { + printf("implicit"); + flags &= ~MPTCP_PM_ADDR_FLAG_IMPLICIT; + if (flags) + printf(","); + } + /* bump unknown flags, if any */ if (flags) printf("0x%x", flags); From 0dc626e5e853a966dfbb6ee6cd607e13a2acd5ae Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 7 Mar 2022 12:44:39 -0800 Subject: [PATCH 9/9] mptcp: add fullmesh flag check for adding address The fullmesh flag mustn't be used with the signal flag when adding an address. This patch added the necessary flags check for this case. Fixes: 73c762c1f07d ("mptcp: set fullmesh flag in pm_netlink") Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index e090810bb35d..800515fe5e1d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1264,6 +1264,12 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && + addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh"); + return -EINVAL; + } + if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); return -EINVAL;