From c4ee118561a0f74442439b7b5b486db1ac1ddfeb Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 14 Jun 2022 10:17:33 -0700
Subject: [PATCH 1/3] tcp: fix over estimation in sk_forced_mem_schedule()

sk_forced_mem_schedule() has a bug similar to ones fixed
in commit 7c80b038d23e ("net: fix sk_wmem_schedule() and
sk_rmem_schedule() errors")

While this bug has little chance to trigger in old kernels,
we need to fix it before the following patch.

Fixes: d83769a580f1 ("tcp: fix possible deadlock in tcp_send_fin()")
Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Reviewed-by: Shakeel Butt
Reviewed-by: Wei Wang
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8ab98e1aca67..18c913a2347a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3362,11 +3362,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
  */
 void sk_forced_mem_schedule(struct sock *sk, int size)
 {
-	int amt;
+	int delta, amt;
 
-	if (size <= sk->sk_forward_alloc)
+	delta = size - sk->sk_forward_alloc;
+	if (delta <= 0)
 		return;
-	amt = sk_mem_pages(size);
+	amt = sk_mem_pages(delta);
 	sk->sk_forward_alloc += amt << PAGE_SHIFT;
 	sk_memory_allocated_add(sk, amt);
 

From 849b425cd091e1804af964b771761cfbefbafb43 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 14 Jun 2022 10:17:34 -0700
Subject: [PATCH 2/3] tcp: fix possible freeze in tx path under memory
 pressure

Blamed commit only dealt with applications issuing small writes.

Issue here is that we allow to force memory schedule for the sk_buff
allocation, but we have no guarantee that sendmsg() is able to copy
some payload in it.

In this patch, I make sure the socket can use up to tcp_wmem[0] bytes.

For example, if we consider tcp_wmem[0] = 4096 (default on x86),
and initial skb->truesize being 1280, tcp_sendmsg() is able to
copy up to 2816 bytes under memory pressure.

Before this patch a sendmsg() sending more than 2816 bytes
would either block forever (if persistent memory pressure),
or return -EAGAIN.

For bigger MTU networks, it is advised to increase tcp_wmem[0]
to avoid sending too small packets.

v2: deal with zero copy paths.

Fixes: 8e4d980ac215 ("tcp: fix behavior for epoll edge trigger")
Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Reviewed-by: Wei Wang
Reviewed-by: Shakeel Butt
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0db0ec6b0a96..b278063a1724 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -951,6 +951,23 @@ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
+static int tcp_wmem_schedule(struct sock *sk, int copy)
+{
+	int left;
+
+	if (likely(sk_wmem_schedule(sk, copy)))
+		return copy;
+
+	/* We could be in trouble if we have nothing queued.
+	 * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
+	 * to guarantee some progress.
+	 */
+	left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued;
+	if (left > 0)
+		sk_forced_mem_schedule(sk, min(left, copy));
+	return min(copy, sk->sk_forward_alloc);
+}
+
 static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
 				      struct page *page, int offset, size_t *size)
 {
@@ -986,7 +1003,11 @@ static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
 			tcp_mark_push(tp, skb);
 			goto new_segment;
 		}
-		if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy))
+		if (tcp_downgrade_zcopy_pure(sk, skb))
+			return NULL;
+
+		copy = tcp_wmem_schedule(sk, copy);
+		if (!copy)
 			return NULL;
 
 		if (can_coalesce) {
@@ -1334,8 +1355,11 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 
 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
-			if (tcp_downgrade_zcopy_pure(sk, skb) ||
-			    !sk_wmem_schedule(sk, copy))
+			if (tcp_downgrade_zcopy_pure(sk, skb))
+				goto wait_for_space;
+
+			copy = tcp_wmem_schedule(sk, copy);
+			if (!copy)
 				goto wait_for_space;
 
 			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
@@ -1362,7 +1386,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 				skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
 
 			if (!skb_zcopy_pure(skb)) {
-				if (!sk_wmem_schedule(sk, copy))
+				copy = tcp_wmem_schedule(sk, copy);
+				if (!copy)
 					goto wait_for_space;
 			}
 
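For reference, the arithmetic in the patch 2/3 changelog (tcp_wmem[0] = 4096,
initial skb->truesize of 1280, hence up to 2816 bytes copied under pressure)
can be traced with a small standalone program. This is only a sketch:
fake_sock, forced_mem_schedule() and wmem_schedule() below are simplified
userspace stand-ins for the kernel's struct sock, sk_forced_mem_schedule()
and tcp_wmem_schedule(), not the kernel implementation.

/* Userspace sketch of the tcp_wmem_schedule() headroom arithmetic.
 * Everything here (fake_sock, the pressure flag, the helpers) is a
 * simplified stand-in for illustration, not kernel code.
 */
#include <stdio.h>

#define PAGE_SIZE 4096

struct fake_sock {
	int sk_forward_alloc;	/* bytes charged to the socket but not yet used */
	int sk_wmem_queued;	/* truesize of skbs already in the write queue */
	int tcp_wmem0;		/* stand-in for net.ipv4.tcp_wmem[0] */
	int under_pressure;	/* pretend sk_wmem_schedule() always fails */
};

static int min_int(int a, int b) { return a < b ? a : b; }

/* Mirrors sk_forced_mem_schedule() after patch 1/3: charge only the
 * delta that sk_forward_alloc cannot already cover, in page quanta.
 */
static void forced_mem_schedule(struct fake_sock *sk, int size)
{
	int delta = size - sk->sk_forward_alloc;

	if (delta <= 0)
		return;
	sk->sk_forward_alloc += ((delta + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
}

/* Mirrors tcp_wmem_schedule() from patch 2/3: under pressure, grant
 * only whatever tcp_wmem[0] headroom the socket still has.
 */
static int wmem_schedule(struct fake_sock *sk, int copy)
{
	int left;

	if (!sk->under_pressure)
		return copy;		/* normal path: request fully granted */

	left = sk->tcp_wmem0 - sk->sk_wmem_queued;
	if (left > 0)
		forced_mem_schedule(sk, min_int(left, copy));
	return min_int(copy, sk->sk_forward_alloc);
}

int main(void)
{
	struct fake_sock sk = { .tcp_wmem0 = 4096, .under_pressure = 1 };

	/* First skb (truesize 1280) is force-charged: one 4096-byte page
	 * is scheduled, and the skb itself consumes 1280 of it.
	 */
	forced_mem_schedule(&sk, 1280);
	sk.sk_forward_alloc -= 1280;
	sk.sk_wmem_queued += 1280;

	/* A large sendmsg() now makes progress, capped at 4096 - 1280. */
	printf("granted %d of 8192 requested bytes\n", wmem_schedule(&sk, 8192));
	return 0;
}

Built with any C compiler, this prints "granted 2816 of 8192 requested
bytes", matching the changelog: before the fix such a write would have
stalled or returned -EAGAIN instead of making partial progress.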