From 4fbfde4e272065943cbcf2d016f0679456cb4f75 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 23 May 2023 18:14:51 +0200 Subject: [PATCH 1/3] net: tcp: make the txhash available in TIME_WAIT sockets for IPv4 too Commit c67b85558ff2 ("ipv6: tcp: send consistent autoflowlabel in TIME_WAIT state") made the socket txhash also available in TIME_WAIT sockets but for IPv6 only. Make it available for IPv4 too as we'll use it in later commits. Signed-off-by: Antoine Tenart Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/tcp_minisocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dac0d62120e6..04fc328727e6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -303,6 +303,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; + tw->tw_txhash = sk->sk_txhash; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -311,7 +312,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); - tw->tw_txhash = sk->sk_txhash; tw->tw_ipv6only = sk->sk_ipv6only; } #endif From c0a8966e2bc7d31f77a7246947ebc09c1ff06066 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 23 May 2023 18:14:52 +0200 Subject: [PATCH 2/3] net: ipv4: use consistent txhash in TIME_WAIT and SYN_RECV When using IPv4/TCP, skb->hash comes from sk->sk_txhash except in TIME_WAIT and SYN_RECV where it's not set in the reply skb from ip_send_unicast_reply. Those packets will have a mismatched hash with others from the same flow as their hashes will be 0. IPv6 does not have the same issue as the hash is set from the socket txhash in those cases. This commits sets the hash in the reply skb from ip_send_unicast_reply, which makes the IPv4 code behaving like IPv6. Signed-off-by: Antoine Tenart Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/ip.h | 2 +- net/ipv4/ip_output.c | 4 +++- net/ipv4/tcp_ipv4.c | 14 +++++++++----- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 8a3860a916dc..2982dd13cf16 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -286,7 +286,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len, u64 transmit_time); + unsigned int len, u64 transmit_time, u32 txhash); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define __IP_INC_STATS(net, field) __SNMP_INC_STATS64((net)->mib.ip_statistics, field) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 553c740a6bfb..244fb9365d87 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1570,7 +1570,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len, u64 transmit_time) + unsigned int len, u64 transmit_time, u32 txhash) { struct ip_options_data replyopts; struct ipcm_cookie ipc; @@ -1633,6 +1633,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; nskb->mono_delivery_time = !!transmit_time; + if (txhash) + skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); } out: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index af043f063a73..a50bd782f91f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -692,6 +692,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) u64 transmit_time = 0; struct sock *ctl_sk; struct net *net; + u32 txhash = 0; /* Never send a reset in response to a reset. */ if (th->rst) @@ -829,6 +830,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); xfrm_sk_clone_policy(ctl_sk, sk); + txhash = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_txhash : sk->sk_txhash; } else { ctl_sk->sk_mark = 0; ctl_sk->sk_priority = 0; @@ -837,7 +840,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, - transmit_time); + transmit_time, txhash); xfrm_sk_free_policy(ctl_sk); sock_net_set(ctl_sk, &init_net); @@ -859,7 +862,7 @@ static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, - int reply_flags, u8 tos) + int reply_flags, u8 tos, u32 txhash) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -935,7 +938,7 @@ static void tcp_v4_send_ack(const struct sock *sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, - transmit_time); + transmit_time, txhash); sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); @@ -955,7 +958,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, - tw->tw_tos + tw->tw_tos, + tw->tw_txhash ); inet_twsk_put(tw); @@ -988,7 +992,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 0, tcp_md5_do_lookup(sk, l3index, addr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos); + ip_hdr(skb)->tos, tcp_rsk(req)->txhash); } /* From 7016eb738651ed1dfeef2bbf266bc7dac734067d Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 23 May 2023 18:14:53 +0200 Subject: [PATCH 3/3] Documentation: net: net.core.txrehash is not specific to listening sockets The net.core.txrehash documentation mentions this knob is for listening sockets only, while sk_rethink_txhash can be called on SYN and RTO retransmits on all TCP sockets. Remove the listening socket part. Signed-off-by: Antoine Tenart Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- Documentation/admin-guide/sysctl/net.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 466c560b0c30..4877563241f3 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -386,8 +386,8 @@ Default : 0 (for compatibility reasons) txrehash -------- -Controls default hash rethink behaviour on listening socket when SO_TXREHASH -option is set to SOCK_TXREHASH_DEFAULT (i. e. not overridden by setsockopt). +Controls default hash rethink behaviour on socket when SO_TXREHASH option is set +to SOCK_TXREHASH_DEFAULT (i. e. not overridden by setsockopt). If set to 1 (default), hash rethink is performed on listening socket. If set to 0, hash rethink is not performed.