diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 62cdd2bcff9d..f994e7f45f7b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -179,6 +179,35 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, return mptcp_try_coalesce((struct sock *)msk, to, from); } +/* "inspired" by tcp_rcvbuf_grow(), main difference: + * - mptcp does not maintain a msk-level window clamp + * - returns true when the receive buffer is actually updated + */ +static bool mptcp_rcvbuf_grow(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + const struct net *net = sock_net(sk); + int rcvwin, rcvbuf, cap; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || + (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + return false; + + rcvwin = msk->rcvq_space.space << 1; + + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) + rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; + + cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); + + rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap); + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + return true; + } + return false; +} + /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks @@ -292,6 +321,9 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) end: skb_condense(skb); skb_set_owner_r(skb, sk); + /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ + if (sk->sk_socket) + mptcp_rcvbuf_grow(sk); } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -784,18 +816,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) return moved; } -static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) -{ - if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) - WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); -} - static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_sock *msk = mptcp_sk(sk); - __mptcp_rcvbuf_update(sk, ssk); - /* Wake-up the reader only for in-sequence data */ if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); @@ -2014,48 +2038,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - u64 rcvwin, grow; - int rcvbuf; + msk->rcvq_space.space = msk->rcvq_space.copied; + if (mptcp_rcvbuf_grow(sk)) { - rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; + /* Make subflows follow along. If we do not do this, we + * get drops at subflow level if skbs can't be moved to + * the mptcp rx queue fast enough (announced rcv_win can + * exceed ssk->sk_rcvbuf). + */ + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk; + bool slow; - grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); - - do_div(grow, msk->rcvq_space.space); - rcvwin += (grow << 1); - - rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - - if (rcvbuf > sk->sk_rcvbuf) { - u32 window_clamp; - - window_clamp = mptcp_win_from_space(sk, rcvbuf); - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - - /* Make subflows follow along. If we do not do this, we - * get drops at subflow level if skbs can't be moved to - * the mptcp rx queue fast enough (announced rcv_win can - * exceed ssk->sk_rcvbuf). - */ - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk; - bool slow; - - ssk = mptcp_subflow_tcp_sock(subflow); - slow = lock_sock_fast(ssk); - WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); - WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); - if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); - unlock_sock_fast(ssk, slow); - } + ssk = mptcp_subflow_tcp_sock(subflow); + slow = lock_sock_fast(ssk); + tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied; + tcp_rcvbuf_grow(ssk); + unlock_sock_fast(ssk, slow); } } - msk->rcvq_space.space = msk->rcvq_space.copied; new_measure: msk->rcvq_space.copied = 0; msk->rcvq_space.time = mstamp; @@ -2084,11 +2086,6 @@ static bool __mptcp_move_skbs(struct sock *sk) if (list_empty(&msk->conn_list)) return false; - /* verify we can move any data from the subflow, eventually updating */ - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) - mptcp_for_each_subflow(msk, subflow) - __mptcp_rcvbuf_update(sk, subflow->tcp_sock); - subflow = list_first_entry(&msk->conn_list, struct mptcp_subflow_context, node); for (;;) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 371084a3fc22..52f9cfa4ce95 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -341,8 +341,8 @@ struct mptcp_sock { struct mptcp_pm_data pm; struct mptcp_sched_ops *sched; struct { - u32 space; /* bytes copied in last measurement window */ - u32 copied; /* bytes copied in this measurement window */ + int space; /* bytes copied in last measurement window */ + int copied; /* bytes copied in this measurement window */ u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space;