From 512bd0f9f926a05c724a9fd72bc4e14213845e01 Mon Sep 17 00:00:00 2001 From: Kevin Yang Date: Mon, 3 Jun 2024 21:30:53 +0000 Subject: [PATCH 1/2] tcp: derive delack_max with tcp_rto_min helper Rto_min now has multiple sources, ordered by preprecedence high to low: ip route option rto_min, icsk->icsk_rto_min. When derive delack_max from rto_min, we should not only use ip route option, but should use tcp_rto_min helper to get the correct rto_min. Signed-off-by: Kevin Yang Reviewed-by: Neal Cardwell Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Tony Lu Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f97e098f18a5..090fb0c24599 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4163,16 +4163,9 @@ EXPORT_SYMBOL(tcp_connect); u32 tcp_delack_max(const struct sock *sk) { - const struct dst_entry *dst = __sk_dst_get(sk); - u32 delack_max = inet_csk(sk)->icsk_delack_max; + u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1; - if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) { - u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); - u32 delack_from_rto_min = max_t(int, 1, rto_min - 1); - - delack_max = min_t(u32, delack_max, delack_from_rto_min); - } - return delack_max; + return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min); } /* Send out a delayed ack, the caller does the policy checking From f086edef71be7174a16c1ed67ac65a085cda28b1 Mon Sep 17 00:00:00 2001 From: Kevin Yang Date: Mon, 3 Jun 2024 21:30:54 +0000 Subject: [PATCH 2/2] tcp: add sysctl_tcp_rto_min_us Adding a sysctl knob to allow user to specify a default rto_min at socket init time, other than using the hard coded 200ms default rto_min. Note that the rto_min route option has the highest precedence for configuring this setting, followed by the TCP_BPF_RTO_MIN socket option, followed by the tcp_rto_min_us sysctl. Signed-off-by: Kevin Yang Reviewed-by: Neal Cardwell Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Tony Lu Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 13 +++++++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 8 ++++++++ net/ipv4/tcp.c | 4 +++- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 26 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index bd50df6a5a42..6e99eccdb837 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1196,6 +1196,19 @@ tcp_pingpong_thresh - INTEGER Default: 1 +tcp_rto_min_us - INTEGER + Minimal TCP retransmission timeout (in microseconds). Note that the + rto_min route option has the highest precedence for configuring this + setting, followed by the TCP_BPF_RTO_MIN socket option, followed by + this tcp_rto_min_us sysctl. + + The recommended practice is to use a value less or equal to 200000 + microseconds. + + Possible Values: 1 - INT_MAX + + Default: 200000 + UDP variables ============= diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index c356c458b340..a91bb971f901 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -170,6 +170,7 @@ struct netns_ipv4 { u8 sysctl_tcp_sack; u8 sysctl_tcp_window_scaling; u8 sysctl_tcp_timestamps; + int sysctl_tcp_rto_min_us; u8 sysctl_tcp_recovery; u8 sysctl_tcp_thin_linear_timeouts; u8 sysctl_tcp_slow_start_after_idle; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d7892f34a15b..bb64c0ef092d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1503,6 +1503,14 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "tcp_rto_min_us", + .data = &init_net.ipv4.sysctl_tcp_rto_min_us, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, }; static __net_init int ipv4_sysctl_init_net(struct net *net) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5fa68e7f6ddb..fa43aaacd92b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -420,6 +420,7 @@ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + int rto_min_us; tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; @@ -428,7 +429,8 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; - icsk->icsk_rto_min = TCP_RTO_MIN; + rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us); + icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us); icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3ef4b274c24b..3613e08ca794 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3502,6 +3502,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_shrink_window = 0; net->ipv4.sysctl_tcp_pingpong_thresh = 1; + net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); return 0; }