Merge branch 'tcp-tcp_rcvbuf_grow-changes'

Eric Dumazet says:

====================
tcp: tcp_rcvbuf_grow() changes

The first patch is minor and moves tcp_moderate_rcvbuf into the appropriate group.

The second patch is another attempt to keep sk->sk_rcvbuf small for DC
(small RTT) TCP flows, for optimal performance.
====================

Link: https://patch.msgid.link/20251119084813.3684576-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski
2025-11-20 17:44:26 -08:00
7 changed files with 42 additions and 13 deletions

View File

@@ -673,6 +673,16 @@ tcp_moderate_rcvbuf - BOOLEAN
Default: 1 (enabled)
tcp_rcvbuf_low_rtt - INTEGER
rcvbuf autotuning can overestimate the final socket rcvbuf, which
can lead to cache thrashing for high throughput flows.
For small RTT flows (below tcp_rcvbuf_low_rtt usecs), we can relax
rcvbuf growth: a few additional ms to reach the final (and smaller)
rcvbuf is a good tradeoff.
Default: 1000 (1 ms)
tcp_mtu_probing - INTEGER
Controls TCP Packetization-Layer Path MTU Discovery. Takes three
values:

View File

@@ -102,7 +102,8 @@ u8 sysctl_tcp_app_win
u8 sysctl_tcp_frto tcp_enter_loss
u8 sysctl_tcp_nometrics_save TCP_LAST_ACK/tcp_update_metrics
u8 sysctl_tcp_no_ssthresh_metrics_save TCP_LAST_ACK/tcp_(update/init)_metrics
u8 sysctl_tcp_moderate_rcvbuf read_mostly read_mostly tcp_tso_should_defer(tx);tcp_rcv_space_adjust(rx)
u8 sysctl_tcp_moderate_rcvbuf read_mostly tcp_rcvbuf_grow()
int sysctl_tcp_rcvbuf_low_rtt read_mostly tcp_rcvbuf_grow()
u8 sysctl_tcp_tso_win_divisor read_mostly tcp_tso_should_defer(tcp_write_xmit)
u8 sysctl_tcp_workaround_signed_windows tcp_select_window
int sysctl_tcp_limit_output_bytes read_mostly tcp_small_queue_check(tcp_write_xmit)

View File

@@ -74,17 +74,18 @@ struct netns_ipv4 {
/* TXRX readonly hotpath cache lines */
__cacheline_group_begin(netns_ipv4_read_txrx);
u8 sysctl_tcp_moderate_rcvbuf;
__cacheline_group_end(netns_ipv4_read_txrx);
/* RX readonly hotpath cache line */
__cacheline_group_begin(netns_ipv4_read_rx);
u8 sysctl_tcp_moderate_rcvbuf;
u8 sysctl_ip_early_demux;
u8 sysctl_tcp_early_demux;
u8 sysctl_tcp_l3mdev_accept;
/* 3 bytes hole, try to pack */
int sysctl_tcp_reordering;
int sysctl_tcp_rmem[3];
int sysctl_tcp_rcvbuf_low_rtt;
__cacheline_group_end(netns_ipv4_read_rx);
struct inet_timewait_death_row tcp_death_row;

View File

@@ -1223,14 +1223,12 @@ static void __init netns_ipv4_struct_check(void)
sysctl_tcp_wmem);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_ip_fwd_use_pmtu);
CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33);
/* TXRX readonly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
sysctl_tcp_moderate_rcvbuf);
CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);
/* RX readonly hotpath cache line */
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_moderate_rcvbuf);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_rcvbuf_low_rtt);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_ip_early_demux);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
@@ -1241,7 +1239,6 @@ static void __init netns_ipv4_struct_check(void)
sysctl_tcp_reordering);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_rmem);
CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
}
#endif

View File

@@ -1342,6 +1342,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_rcvbuf_low_rtt",
.data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
{
.procname = "tcp_tso_win_divisor",
.data = &init_net.ipv4.sysctl_tcp_tso_win_divisor,

View File

@@ -896,6 +896,7 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 rcvwin, rcvbuf, cap, oldval;
u32 rtt_threshold, rtt_us;
u64 grow;
oldval = tp->rcvq_space.space;
@@ -908,10 +909,19 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
/* DRS is always one RTT late. */
rcvwin = newval << 1;
/* slow start: allow the sender to double its rate. */
grow = (u64)rcvwin * (newval - oldval);
do_div(grow, oldval);
rcvwin += grow << 1;
rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt);
if (rtt_us < rtt_threshold) {
/* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold.
* It might take few additional ms to reach 'line rate',
* but will avoid sk_rcvbuf inflation and poor cache use.
*/
grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold);
} else {
/* slow start: allow the sender to double its rate. */
grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval);
}
rcvwin += grow;
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;

View File

@@ -3566,6 +3566,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_adv_win_scale = 1;
net->ipv4.sysctl_tcp_frto = 2;
net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.