From a42f3076648e0b507de9039f8085edcc10b35fb7 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 21 Oct 2024 17:14:03 +0200 Subject: [PATCH 1/4] mptcp: pm: send ACK on non-stale subflows If the subflow is considered as "staled", it is better to avoid it to send an ACK carrying an ADD_ADDR or RM_ADDR. Another subflow, if any, will then be selected. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241021-net-next-mptcp-misc-6-13-v1-1-1ef02746504a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index db586a5b3866..618289aac0ab 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -781,7 +781,7 @@ bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) { - struct mptcp_subflow_context *subflow; + struct mptcp_subflow_context *subflow, *alt = NULL; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); @@ -792,10 +792,18 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) { if (__mptcp_subflow_active(subflow)) { - mptcp_pm_send_ack(msk, subflow, false, false); - break; + if (!subflow->stale) { + mptcp_pm_send_ack(msk, subflow, false, false); + return; + } + + if (!alt) + alt = subflow; } } + + if (alt) + mptcp_pm_send_ack(msk, alt, false, false); } int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, From 581c8cbfa934aaa555daa4e843242fcecc160f05 Mon Sep 17 00:00:00 2001 From: Gang Yan Date: Mon, 21 Oct 2024 17:14:04 +0200 Subject: [PATCH 2/4] mptcp: annotate data-races around subflow->fully_established We introduce the same handling for potential data races with the 'fully_established' flag in subflow as previously done for msk->fully_established. Additionally, we make a crucial change: convert the subflow's 'fully_established' from 'bit_field' to 'bool' type. This is necessary because methods for avoiding data races don't work well with 'bit_field'. Specifically, the 'READ_ONCE' needs to know the size of the variable being accessed, which is not supported in 'bit_field'. Also, 'test_bit' expect the address of 'bit_field'. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/516 Signed-off-by: Gang Yan Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241021-net-next-mptcp-misc-6-13-v1-2-1ef02746504a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/diag.c | 2 +- net/mptcp/options.c | 4 ++-- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 6 +++--- net/mptcp/subflow.c | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index 2d3efb405437..02205f7994d7 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -47,7 +47,7 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM; if (sf->request_bkup) flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC; - if (sf->fully_established) + if (READ_ONCE(sf->fully_established)) flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED; if (sf->conn_finished) flags |= MPTCP_SUBFLOW_FLAG_CONNECTED; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 370c3836b771..1603b3702e22 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -461,7 +461,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, return false; /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ - if (subflow->fully_established || snd_data_fin_enable || + if (READ_ONCE(subflow->fully_established) || snd_data_fin_enable || subflow->snd_isn != TCP_SKB_CB(skb)->seq || sk->sk_state != TCP_ESTABLISHED) return false; @@ -930,7 +930,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, /* here we can process OoO, in-window pkts, only in-sequence 4th ack * will make the subflow fully established */ - if (likely(subflow->fully_established)) { + if (likely(READ_ONCE(subflow->fully_established))) { /* on passive sockets, check for 3rd ack retransmission * note that msk is always set by subflow_syn_recv_sock() * for mp_join subflows diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1f5c63eb21f0..a6c9661a4c45 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3511,7 +3511,7 @@ static void schedule_3rdack_retransmission(struct sock *ssk) struct tcp_sock *tp = tcp_sk(ssk); unsigned long timeout; - if (mptcp_subflow_ctx(ssk)->fully_established) + if (READ_ONCE(mptcp_subflow_ctx(ssk)->fully_established)) return; /* reschedule with a timeout above RTT, as we must look only for drop */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 568a72702b08..a93e661ef5c4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -513,7 +513,6 @@ struct mptcp_subflow_context { request_bkup : 1, mp_capable : 1, /* remote is MPTCP capable */ mp_join : 1, /* remote is JOINing */ - fully_established : 1, /* path validated */ pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, @@ -532,10 +531,11 @@ struct mptcp_subflow_context { is_mptfo : 1, /* subflow is doing TFO */ close_event_done : 1, /* has done the post-closed part */ mpc_drop : 1, /* the MPC option has been dropped in a rtx */ - __unused : 8; + __unused : 9; bool data_avail; bool scheduled; bool pm_listener; /* a listener managed by the kernel PM? */ + bool fully_established; /* path validated */ u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -780,7 +780,7 @@ static inline bool __tcp_can_send(const struct sock *ssk) static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) { /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ - if (subflow->request_join && !subflow->fully_established) + if (subflow->request_join && !READ_ONCE(subflow->fully_established)) return false; return __tcp_can_send(mptcp_subflow_tcp_sock(subflow)); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 6170f2fff71e..860903e06422 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -800,7 +800,7 @@ void __mptcp_subflow_fully_established(struct mptcp_sock *msk, const struct mptcp_options_received *mp_opt) { subflow_set_remote_key(msk, subflow, mp_opt); - subflow->fully_established = 1; + WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); if (subflow->is_mptfo) @@ -2062,7 +2062,7 @@ static void subflow_ulp_clone(const struct request_sock *req, } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; - new_ctx->fully_established = 1; + WRITE_ONCE(new_ctx->fully_established, true); new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; new_ctx->request_bkup = subflow_req->request_bkup; From 5add80bfdc46f9ad6857c80e3af109177e59a280 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Oct 2024 17:14:05 +0200 Subject: [PATCH 3/4] mptcp: implement mptcp_pm_connection_closed The MPTCP path manager event handler mptcp_pm_connection_closed interface has been added in the commit 1b1c7a0ef7f3 ("mptcp: Add path manager interface") but it was an empty function from then on. With such name, it sounds good to invoke mptcp_event with the MPTCP_EVENT_CLOSED event type from it. It also removes a bit of duplicated code. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241021-net-next-mptcp-misc-6-13-v1-3-1ef02746504a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 +++ net/mptcp/protocol.c | 6 ++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 620264c75dc2..16c336c51940 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -154,6 +154,9 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) void mptcp_pm_connection_closed(struct mptcp_sock *msk) { pr_debug("msk=%p\n", msk); + + if (msk->token) + mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); } void mptcp_pm_subflow_established(struct mptcp_sock *msk) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a6c9661a4c45..e978e05ec8d1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3139,8 +3139,7 @@ bool __mptcp_close(struct sock *sk, long timeout) sock_hold(sk); pr_debug("msk=%p state=%d\n", sk, sk->sk_state); - if (msk->token) - mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); + mptcp_pm_connection_closed(msk); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); @@ -3206,8 +3205,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) mptcp_stop_rtx_timer(sk); mptcp_stop_tout_timer(sk); - if (msk->token) - mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); + mptcp_pm_connection_closed(msk); /* msk->subflow is still intact, the following will not free the first * subflow From 46a3282b87b1f9a88534eba59ecf852b2a21289c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 21 Oct 2024 17:14:06 +0200 Subject: [PATCH 4/4] mptcp: use "middlebox interference" RST when no DSS RFC8684 suggests use of "Middlebox interference (code 0x06)" in case of fully established subflow that carries data at TCP level with no DSS sub-option. This is generally the case when mpext is NULL or mpext->use_map is 0: use a dedicated value of 'mapping_status' and use it before closing the socket in subflow_check_data_avail(). Link: https://github.com/multipath-tcp/mptcp_net-next/issues/518 Signed-off-by: Davide Caratti Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241021-net-next-mptcp-misc-6-13-v1-4-1ef02746504a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 860903e06422..07352b15f145 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -971,7 +971,8 @@ enum mapping_status { MAPPING_EMPTY, MAPPING_DATA_FIN, MAPPING_DUMMY, - MAPPING_BAD_CSUM + MAPPING_BAD_CSUM, + MAPPING_NODSS }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) @@ -1128,8 +1129,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk, return MAPPING_EMPTY; } + /* If the required DSS has likely been dropped by a middlebox */ if (!subflow->map_valid) - return MAPPING_INVALID; + return MAPPING_NODSS; goto validate_seq; } @@ -1343,7 +1345,7 @@ static bool subflow_check_data_avail(struct sock *ssk) status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || - status == MAPPING_BAD_CSUM)) + status == MAPPING_BAD_CSUM || status == MAPPING_NODSS)) goto fallback; if (status != MAPPING_OK) @@ -1396,7 +1398,9 @@ static bool subflow_check_data_avail(struct sock *ssk) * subflow_error_report() will introduce the appropriate barriers */ subflow->reset_transient = 0; - subflow->reset_reason = MPTCP_RST_EMPTCP; + subflow->reset_reason = status == MAPPING_NODSS ? + MPTCP_RST_EMIDDLEBOX : + MPTCP_RST_EMPTCP; reset: WRITE_ONCE(ssk->sk_err, EBADMSG);