mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-22 12:05:09 -04:00
Merge branch 'tcp-tcp_rcvbuf_grow-changes'
Eric Dumazet says: ==================== tcp: tcp_rcvbuf_grow() changes First patch is minor and moves tcp_moderate_rcvbuf in appropriate group. Second patch is another attempt to keep small sk->sk_rcvbuf for DC (small RTT) TCP flows for optimal performance. ==================== Link: https://patch.msgid.link/20251119084813.3684576-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
@@ -673,6 +673,16 @@ tcp_moderate_rcvbuf - BOOLEAN
|
||||
|
||||
Default: 1 (enabled)
|
||||
|
||||
tcp_rcvbuf_low_rtt - INTEGER
|
||||
rcvbuf autotuning can overestimate the final socket rcvbuf, which
|
||||
can lead to cache thrashing for high throughput flows.
|
||||
|
||||
For small RTT flows (below tcp_rcvbuf_low_rtt usecs), we can relax
|
||||
rcvbuf growth: a few additional ms to reach the final (and smaller)
|
||||
rcvbuf is a good tradeoff.
|
||||
|
||||
Default: 1000 (1 ms)
|
||||
|
||||
tcp_mtu_probing - INTEGER
|
||||
Controls TCP Packetization-Layer Path MTU Discovery. Takes three
|
||||
values:
|
||||
|
||||
@@ -102,7 +102,8 @@ u8 sysctl_tcp_app_win
|
||||
u8 sysctl_tcp_frto tcp_enter_loss
|
||||
u8 sysctl_tcp_nometrics_save TCP_LAST_ACK/tcp_update_metrics
|
||||
u8 sysctl_tcp_no_ssthresh_metrics_save TCP_LAST_ACK/tcp_(update/init)_metrics
|
||||
u8 sysctl_tcp_moderate_rcvbuf read_mostly read_mostly tcp_tso_should_defer(tx);tcp_rcv_space_adjust(rx)
|
||||
u8 sysctl_tcp_moderate_rcvbuf read_mostly tcp_rcvbuf_grow()
|
||||
int sysctl_tcp_rcvbuf_low_rtt read_mostly tcp_rcvbuf_grow()
|
||||
u8 sysctl_tcp_tso_win_divisor read_mostly tcp_tso_should_defer(tcp_write_xmit)
|
||||
u8 sysctl_tcp_workaround_signed_windows tcp_select_window
|
||||
int sysctl_tcp_limit_output_bytes read_mostly tcp_small_queue_check(tcp_write_xmit)
|
||||
|
||||
@@ -74,17 +74,18 @@ struct netns_ipv4 {
|
||||
|
||||
/* TXRX readonly hotpath cache lines */
|
||||
__cacheline_group_begin(netns_ipv4_read_txrx);
|
||||
u8 sysctl_tcp_moderate_rcvbuf;
|
||||
__cacheline_group_end(netns_ipv4_read_txrx);
|
||||
|
||||
/* RX readonly hotpath cache line */
|
||||
__cacheline_group_begin(netns_ipv4_read_rx);
|
||||
u8 sysctl_tcp_moderate_rcvbuf;
|
||||
u8 sysctl_ip_early_demux;
|
||||
u8 sysctl_tcp_early_demux;
|
||||
u8 sysctl_tcp_l3mdev_accept;
|
||||
/* 3 bytes hole, try to pack */
|
||||
int sysctl_tcp_reordering;
|
||||
int sysctl_tcp_rmem[3];
|
||||
int sysctl_tcp_rcvbuf_low_rtt;
|
||||
__cacheline_group_end(netns_ipv4_read_rx);
|
||||
|
||||
struct inet_timewait_death_row tcp_death_row;
|
||||
|
||||
@@ -1223,14 +1223,12 @@ static void __init netns_ipv4_struct_check(void)
|
||||
sysctl_tcp_wmem);
|
||||
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
|
||||
sysctl_ip_fwd_use_pmtu);
|
||||
CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33);
|
||||
|
||||
/* TXRX readonly hotpath cache lines */
|
||||
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
|
||||
sysctl_tcp_moderate_rcvbuf);
|
||||
CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);
|
||||
|
||||
/* RX readonly hotpath cache line */
|
||||
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
|
||||
sysctl_tcp_moderate_rcvbuf);
|
||||
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
|
||||
sysctl_tcp_rcvbuf_low_rtt);
|
||||
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
|
||||
sysctl_ip_early_demux);
|
||||
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
|
||||
@@ -1241,7 +1239,6 @@ static void __init netns_ipv4_struct_check(void)
|
||||
sysctl_tcp_reordering);
|
||||
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
|
||||
sysctl_tcp_rmem);
|
||||
CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1342,6 +1342,15 @@ static struct ctl_table ipv4_net_table[] = {
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dou8vec_minmax,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_rcvbuf_low_rtt",
|
||||
.data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_INT_MAX,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_tso_win_divisor",
|
||||
.data = &init_net.ipv4.sysctl_tcp_tso_win_divisor,
|
||||
|
||||
@@ -896,6 +896,7 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
|
||||
const struct net *net = sock_net(sk);
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
u32 rcvwin, rcvbuf, cap, oldval;
|
||||
u32 rtt_threshold, rtt_us;
|
||||
u64 grow;
|
||||
|
||||
oldval = tp->rcvq_space.space;
|
||||
@@ -908,10 +909,19 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
|
||||
/* DRS is always one RTT late. */
|
||||
rcvwin = newval << 1;
|
||||
|
||||
/* slow start: allow the sender to double its rate. */
|
||||
grow = (u64)rcvwin * (newval - oldval);
|
||||
do_div(grow, oldval);
|
||||
rcvwin += grow << 1;
|
||||
rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
|
||||
rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt);
|
||||
if (rtt_us < rtt_threshold) {
|
||||
/* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold.
|
||||
* It might take few additional ms to reach 'line rate',
|
||||
* but will avoid sk_rcvbuf inflation and poor cache use.
|
||||
*/
|
||||
grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold);
|
||||
} else {
|
||||
/* slow start: allow the sender to double its rate. */
|
||||
grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval);
|
||||
}
|
||||
rcvwin += grow;
|
||||
|
||||
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
|
||||
rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
|
||||
|
||||
@@ -3566,6 +3566,7 @@ static int __net_init tcp_sk_init(struct net *net)
|
||||
net->ipv4.sysctl_tcp_adv_win_scale = 1;
|
||||
net->ipv4.sysctl_tcp_frto = 2;
|
||||
net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
|
||||
net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
|
||||
/* This limits the percentage of the congestion window which we
|
||||
* will allow a single TSO frame to consume. Building TSO frames
|
||||
* which are too large can cause TCP streams to be bursty.
|
||||
|
||||
Reference in New Issue
Block a user