From 3764b0c5651e34ceb3e7d5c75b6fd8e7b72112ac Mon Sep 17 00:00:00 2001 From: Nicolas Rybowski Date: Thu, 10 Dec 2020 14:24:58 -0800 Subject: [PATCH 1/9] mptcp: attach subflow socket to parent cgroup It has been observed that the kernel sockets created for the subflows (except the first one) are not in the same cgroup as their parents. That's because the additional subflows are created by kernel workers. This is a problem with eBPF programs attached to the parent's cgroup won't be executed for the children. But also with any other features of CGroup linked to a sk. This patch fixes this behaviour. As the subflow sockets are created by the kernel, we can't use 'mem_cgroup_sk_alloc' because of the current context being the one of the kworker. This is why we have to do low level memcg manipulation, if required. Suggested-by: Matthieu Baerts Suggested-by: Paolo Abeni Acked-by: Matthieu Baerts Signed-off-by: Nicolas Rybowski Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fefcaf497938..bf808f1fabe5 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1167,6 +1167,30 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, return err; } +static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ +#ifdef CONFIG_SOCK_CGROUP_DATA + struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, + *child_skcd = &child->sk_cgrp_data; + + /* only the additional subflows created by kworkers have to be modified */ + if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != + cgroup_id(sock_cgroup_ptr(child_skcd))) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = parent->sk_memcg; + + mem_cgroup_sk_free(child); + if (memcg && css_tryget(&memcg->css)) + child->sk_memcg = memcg; +#endif /* CONFIG_MEMCG */ + + cgroup_sk_free(child_skcd); + *child_skcd = *parent_skcd; + cgroup_sk_clone(child_skcd); + } +#endif /* CONFIG_SOCK_CGROUP_DATA */ +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -1187,6 +1211,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) lock_sock(sf->sk); + /* the newly created socket has to be in the same cgroup as its parent */ + mptcp_attach_cgroup(sk, sf->sk); + /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. */ From 141694df6573b49aa4143c92556544b4b0bbda72 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 10 Dec 2020 14:24:59 -0800 Subject: [PATCH 2/9] mptcp: remove address when netlink flushes addrs When the PM netlink flushes the addresses, invoke the remove address function mptcp_nl_remove_subflow_and_signal_addr to remove the addresses and the subflows. Since this function should not be invoked under lock, move __flush_addrs out of the pernet->lock. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5151cfcd6962..9cc4eefaf080 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -867,13 +867,14 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void __flush_addrs(struct pm_nl_pernet *pernet) +static void __flush_addrs(struct net *net, struct list_head *list) { - while (!list_empty(&pernet->local_addr_list)) { + while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; - cur = list_entry(pernet->local_addr_list.next, + cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); + mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr); list_del_rcu(&cur->list); kfree_rcu(cur, rcu); } @@ -890,11 +891,13 @@ static void __reset_counters(struct pm_nl_pernet *pernet) static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); - __flush_addrs(pernet); + list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); spin_unlock_bh(&pernet->lock); + __flush_addrs(sock_net(skb->sk), &free_list); return 0; } @@ -1156,10 +1159,12 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct net *net; list_for_each_entry(net, net_list, exit_list) { + struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + /* net is removed from namespace list, can't race with * other modifiers */ - __flush_addrs(net_generic(net, pm_nl_pernet_id)); + __flush_addrs(net, &pernet->local_addr_list); } } From 6fe4ccdc3dabe3de573e27fb2684d925bd611458 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 10 Dec 2020 14:25:00 -0800 Subject: [PATCH 3/9] selftests: mptcp: add the flush addrs testcase This patch added the flush addrs testcase. In do_transfer, if the number of removing addresses is less than 8, use the del addr command to remove the addresses one by one. If the number is more than 8, use the flush addrs command to remove the addresses. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 54 +++++++++++++------ 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 0eae628d1ffd..9aa9624cff97 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -264,27 +264,37 @@ do_transfer() cpid=$! if [ $rm_nr_ns1 -gt 0 ]; then - counter=1 - sleep 1 - - while [ $counter -le $rm_nr_ns1 ] - do - ip netns exec ${listener_ns} ./pm_nl_ctl del $counter + if [ $rm_nr_ns1 -lt 8 ]; then + counter=1 sleep 1 - let counter+=1 - done + + while [ $counter -le $rm_nr_ns1 ] + do + ip netns exec ${listener_ns} ./pm_nl_ctl del $counter + sleep 1 + let counter+=1 + done + else + sleep 1 + ip netns exec ${listener_ns} ./pm_nl_ctl flush + fi fi if [ $rm_nr_ns2 -gt 0 ]; then - counter=1 - sleep 1 - - while [ $counter -le $rm_nr_ns2 ] - do - ip netns exec ${connector_ns} ./pm_nl_ctl del $counter + if [ $rm_nr_ns2 -lt 8 ]; then + counter=1 sleep 1 - let counter+=1 - done + + while [ $counter -le $rm_nr_ns2 ] + do + ip netns exec ${connector_ns} ./pm_nl_ctl del $counter + sleep 1 + let counter+=1 + done + else + sleep 1 + ip netns exec ${connector_ns} ./pm_nl_ctl flush + fi fi wait $cpid @@ -663,6 +673,18 @@ chk_join_nr "remove subflows and signal" 3 3 3 chk_add_nr 1 1 chk_rm_nr 2 2 +# subflows and signal, flush +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 3 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 3 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 0 8 8 slow +chk_join_nr "flush subflows and signal" 3 3 3 +chk_add_nr 1 1 +chk_rm_nr 2 2 + # subflow IPv6 reset ip netns exec $ns1 ./pm_nl_ctl limits 0 1 From ba34c3de71ced1582dee55f2fae8638a3655d957 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 10 Dec 2020 14:25:01 -0800 Subject: [PATCH 4/9] mptcp: use MPTCPOPT_HMAC_LEN macro Use the macro MPTCPOPT_HMAC_LEN instead of a constant in struct mptcp_options_received. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f6c3c686a34a..a5bc9599ae5c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -119,7 +119,7 @@ struct mptcp_options_received { u32 token; u32 nonce; u64 thmac; - u8 hmac[20]; + u8 hmac[MPTCPOPT_HMAC_LEN]; u8 join_id; u8 use_map:1, dsn64:1, From ab82e996a1fa1b9ae514fa357d9ce8df62321157 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Dec 2020 14:25:02 -0800 Subject: [PATCH 5/9] mptcp: hold mptcp socket before calling tcp_done When processing options from tcp reset path its possible that tcp_done(ssk) drops the last reference on the mptcp socket which results in use-after-free. Reviewed-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bf808f1fabe5..73e66a406d99 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -313,12 +313,17 @@ void mptcp_subflow_reset(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; + /* must hold: tcp_done() could drop last reference on parent */ + sock_hold(sk); + tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && schedule_work(&mptcp_sk(sk)->work)) - sock_hold(sk); + return; /* worker will put sk for us */ + + sock_put(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) From 049fe386d35353398eee40ba8d76ab62cb5a24e5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Dec 2020 14:25:03 -0800 Subject: [PATCH 6/9] tcp: parse mptcp options contained in reset packets Because TCP-level resets only affect the subflow, there is a MPTCP option to indicate that the MPTCP-level connection should be closed immediately without a mptcp-level fin exchange. This is the 'MPTCP fast close option'. It can be carried on ack segments or TCP resets. In the latter case, its needed to parse mptcp options also for reset packets so that MPTCP can act accordingly. Next patch will add receive side fastclose support in MPTCP. Acked-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 13 ++++++++----- net/ipv4/tcp_minisocks.c | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index a62fb7f8a1e3..b1a05f8b35f0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -611,7 +611,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); -void tcp_reset(struct sock *sk); +void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6ad3b5c38e7..48ee476aa031 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4218,10 +4218,13 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) } /* When we get a reset we do this. */ -void tcp_reset(struct sock *sk) +void tcp_reset(struct sock *sk, struct sk_buff *skb) { trace_tcp_receive_reset(sk); + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb); + /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: @@ -5604,7 +5607,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); } else if (tcp_reset_check(sk, skb)) { - tcp_reset(sk); + tcp_reset(sk, skb); } goto discard; } @@ -5640,7 +5643,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } if (rst_seq_match) - tcp_reset(sk); + tcp_reset(sk, skb); else { /* Disable TFO if RST is out-of-order * and no data has been received @@ -6077,7 +6080,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ if (th->rst) { - tcp_reset(sk); + tcp_reset(sk, skb); goto discard; } @@ -6519,7 +6522,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - tcp_reset(sk); + tcp_reset(sk, skb); return 1; } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 495dda2449fe..0055ae0a3bf8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -801,7 +801,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, req->rsk_ops->send_reset(sk, skb); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); - tcp_reset(sk); + tcp_reset(sk, skb); } if (!fastopen) { inet_csk_reqsk_queue_drop(sk, req); From 50c504a20a754ca37b5e1f4e660cd687769a7dca Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Dec 2020 14:25:04 -0800 Subject: [PATCH 7/9] mptcp: parse and act on incoming FASTCLOSE option parse the MPTCP FASTCLOSE subtype. If provided key matches the local one, schedule the work queue to close (with tcp reset) all subflows. The MPTCP socket moves to closed state immediately. Reviewed-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 17 +++++++++++++++++ net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 4 ++++ 3 files changed, 54 insertions(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1ca60d9da3ef..5e7d7755d1a6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -282,6 +282,16 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); break; + case MPTCPOPT_MP_FASTCLOSE: + if (opsize != TCPOLEN_MPTCP_FASTCLOSE) + break; + + ptr += 2; + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + mp_opt->fastclose = 1; + break; + default: break; } @@ -299,6 +309,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_join = 0; mp_opt->add_addr = 0; mp_opt->ahmac = 0; + mp_opt->fastclose = 0; mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; @@ -942,6 +953,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; + if (mp_opt.fastclose && + msk->local_key == mp_opt.rcvr_key) { + WRITE_ONCE(msk->rcv_fastclose, true); + mptcp_schedule_work((struct sock *)msk); + } + if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { struct mptcp_addr_info addr; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2540d82742ac..cb8b7adf218a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2217,6 +2217,36 @@ static bool mptcp_check_close_timeout(const struct sock *sk) return true; } +static void mptcp_check_fastclose(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = &msk->sk.icsk_inet.sk; + + if (likely(!READ_ONCE(msk->rcv_fastclose))) + return; + + mptcp_token_destroy(msk); + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(tcp_sk); + if (tcp_sk->sk_state != TCP_CLOSE) { + tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + tcp_set_state(tcp_sk, TCP_CLOSE); + } + release_sock(tcp_sk); + } + + inet_sk_state_store(sk, TCP_CLOSE); + sk->sk_shutdown = SHUTDOWN_MASK; + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); + + mptcp_close_wake_up(sk); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -2233,6 +2263,9 @@ static void mptcp_worker(struct work_struct *work) mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); + + mptcp_check_fastclose(msk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a5bc9599ae5c..7cf9d110b85f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -23,6 +23,7 @@ #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) +#define OPTION_MPTCP_FASTCLOSE BIT(9) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -58,6 +59,7 @@ #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 #define TCPOLEN_MPTCP_PORT_LEN 4 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_FASTCLOSE 12 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -110,6 +112,7 @@ struct mptcp_options_received { u16 data_len; u16 mp_capable : 1, mp_join : 1, + fastclose : 1, dss : 1, add_addr : 1, rm_addr : 1, @@ -237,6 +240,7 @@ struct mptcp_sock { bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; + bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; struct sock *ack_hint; From 1bc7327b5fea60328bf72cd702eca1defa2a5655 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Dec 2020 14:25:05 -0800 Subject: [PATCH 8/9] mptcp: pm: simplify select_local_address() There is no need to unconditionally acquire the join list lock, we can simply splice the join list into the subflow list and traverse only the latter. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 9cc4eefaf080..a6d983d80576 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -135,7 +135,7 @@ select_local_address(const struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, *ret = NULL; rcu_read_lock(); - spin_lock_bh(&msk->join_list_lock); + __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; @@ -144,13 +144,11 @@ select_local_address(const struct pm_nl_pernet *pernet, * pending join */ if (entry->addr.family == ((struct sock *)msk)->sk_family && - !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) { + !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) { ret = entry; break; } } - spin_unlock_bh(&msk->join_list_lock); rcu_read_unlock(); return ret; } From 15e6ca974b14c2dc4221738ef81b23ef694c9160 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Dec 2020 14:25:06 -0800 Subject: [PATCH 9/9] mptcp: let MPTCP create max size skbs Currently the xmit path of the MPTCP protocol creates smaller- than-max-size skbs, which is suboptimal for the performances. There are a few things to improve: - when coalescing to an existing skb, must clear the PUSH flag - tcp_build_frag() expect the available space as an argument. When coalescing is enable MPTCP already subtracted the to-be-coalesced skb len. We must increment said argument accordingly. Before: ./use_mptcp.sh netperf -H 127.0.0.1 -t TCP_STREAM [...] 131072 16384 16384 30.00 24414.86 After: ./use_mptcp.sh netperf -H 127.0.0.1 -t TCP_STREAM [...] 131072 16384 16384 30.05 28357.69 Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index cb8b7adf218a..b812aaae8044 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1256,6 +1256,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_ext *mpext = NULL; struct sk_buff *skb, *tail; bool can_collapse = false; + int size_bias = 0; int avail_size; size_t ret = 0; @@ -1277,10 +1278,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext = skb_ext_find(skb, SKB_EXT_MPTCP); can_collapse = (info->size_goal - skb->len > 0) && mptcp_skb_can_collapse_to(data_seq, skb, mpext); - if (!can_collapse) + if (!can_collapse) { TCP_SKB_CB(skb)->eor = 1; - else + } else { + size_bias = skb->len; avail_size = info->size_goal - skb->len; + } } /* Zero window and all data acked? Probe. */ @@ -1300,8 +1303,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, return 0; ret = info->limit - info->sent; - tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page, - dfrag->offset + info->sent, &ret); + tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, + dfrag->page, dfrag->offset + info->sent, &ret); if (!tail) { tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); return -ENOMEM; @@ -1310,8 +1313,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* if the tail skb is still the cached one, collapsing really happened. */ if (skb == tail) { - WARN_ON_ONCE(!can_collapse); + TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += ret; + WARN_ON_ONCE(!can_collapse); WARN_ON_ONCE(zero_window_probe); goto out; }