From ea98b61bddf41e9a9308ff8f931e6077907b1587 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jan 2025 13:55:56 +0000 Subject: [PATCH 1/3] tcp: add drop_reason support to tcp_disordered_ack() Following patch is adding a new drop_reason to tcp_validate_incoming(). Change tcp_disordered_ack() to not return a boolean anymore, but a drop reason. Change its name to tcp_disordered_ack_check() Refactor tcp_validate_incoming() to ease the code review of the following patch, and reduce indentation level. This patch is a refactor, with no functional change. Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250113135558.3180360-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 77 +++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4811727b8a02..24966dd3e49f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4450,34 +4450,38 @@ static u32 tcp_tsval_replay(const struct sock *sk) return inet_csk(sk)->icsk_rto * 1200 / HZ; } -static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) +static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk, + const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); const struct tcphdr *th = tcp_hdr(skb); - u32 seq = TCP_SKB_CB(skb)->seq; + SKB_DR_INIT(reason, TCP_RFC7323_PAWS); u32 ack = TCP_SKB_CB(skb)->ack_seq; + u32 seq = TCP_SKB_CB(skb)->seq; - return /* 1. Pure ACK with correct sequence number. */ - (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && + /* 1. Is this not a pure ACK ? */ + if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq) + return reason; - /* 2. ... and duplicate ACK. */ - ack == tp->snd_una && + /* 2. Is its sequence not the expected one ? */ + if (seq != tp->rcv_nxt) + return reason; - /* 3. ... and does not update window. */ - !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && + /* 3. Is this not a duplicate ACK ? */ + if (ack != tp->snd_una) + return reason; - /* 4. ... and sits in replay window. */ - (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= - tcp_tsval_replay(sk); -} + /* 4. Is this updating the window ? */ + if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) << + tp->rx_opt.snd_wscale)) + return reason; -static inline bool tcp_paws_discard(const struct sock *sk, - const struct sk_buff *skb) -{ - const struct tcp_sock *tp = tcp_sk(sk); + /* 5. Is this not in the replay window ? */ + if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > + tcp_tsval_replay(sk)) + return reason; - return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) && - !tcp_disordered_ack(sk, skb); + return 0; } /* Check segment sequence number for validity. @@ -5949,23 +5953,28 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) && - tp->rx_opt.saw_tstamp && - tcp_paws_discard(sk, skb)) { - if (!th->rst) { - if (unlikely(th->syn)) - goto syn_challenge; - NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); - if (!tcp_oow_rate_limited(sock_net(sk), skb, - LINUX_MIB_TCPACKSKIPPEDPAWS, - &tp->last_oow_ack_time)) - tcp_send_dupack(sk, skb); - SKB_DR_SET(reason, TCP_RFC7323_PAWS); - goto discard; - } - /* Reset is accepted even if it did not pass PAWS. */ - } + if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) || + !tp->rx_opt.saw_tstamp || + tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW)) + goto step1; + reason = tcp_disordered_ack_check(sk, skb); + if (!reason) + goto step1; + /* Reset is accepted even if it did not pass PAWS. */ + if (th->rst) + goto step1; + if (unlikely(th->syn)) + goto syn_challenge; + + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDPAWS, + &tp->last_oow_ack_time)) + tcp_send_dupack(sk, skb); + goto discard; + +step1: /* Step 1: check sequence number */ reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); if (reason) { From 124c4c32e9f3b4d89f5be3342897adc8db5c27b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jan 2025 13:55:57 +0000 Subject: [PATCH 2/3] tcp: add TCP_RFC7323_PAWS_ACK drop reason XPS can cause reorders because of the relaxed OOO conditions for pure ACK packets. For hosts not using RFS, what can happpen is that ACK packets are sent on behalf of the cpu processing NIC interrupts, selecting TX queue A for ACK packet P1. Then a subsequent sendmsg() can run on another cpu. TX queue selection uses the socket hash and can choose another queue B for packets P2 (with payload). If queue A is more congested than queue B, the ACK packet P1 could be sent on the wire after P2. A linux receiver when processing P1 (after P2) currently increments LINUX_MIB_PAWSESTABREJECTED (TcpExtPAWSEstab) and use TCP_RFC7323_PAWS drop reason. It might also send a DUPACK if not rate limited. In order to better understand this pattern, this patch adds a new drop_reason : TCP_RFC7323_PAWS_ACK. For old ACKS like these, we no longer increment LINUX_MIB_PAWSESTABREJECTED and no longer sends a DUPACK, keeping credit for other more interesting DUPACK. perf record -e skb:kfree_skb -a perf script ... swapper 0 [148] 27475.438637: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [208] 27475.438706: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [208] 27475.438908: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [148] 27475.439010: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [148] 27475.439214: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [208] 27475.439286: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK ... Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250113135558.3180360-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 5 +++++ net/ipv4/tcp_input.c | 10 +++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 3a6602f37978..28555109f9bd 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -36,6 +36,7 @@ FN(TCP_OVERWINDOW) \ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ + FN(TCP_RFC7323_PAWS_ACK) \ FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ FN(TCP_INVALID_ACK_SEQUENCE) \ @@ -259,6 +260,10 @@ enum skb_drop_reason { * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED */ SKB_DROP_REASON_TCP_RFC7323_PAWS, + /** + * @SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK: PAWS check, old ACK packet. + */ + SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ SKB_DROP_REASON_TCP_OLD_SEQUENCE, /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 24966dd3e49f..dc0e88bcc535 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4465,7 +4465,9 @@ static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk, /* 2. Is its sequence not the expected one ? */ if (seq != tp->rcv_nxt) - return reason; + return before(seq, tp->rcv_nxt) ? + SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK : + reason; /* 3. Is this not a duplicate ACK ? */ if (ack != tp->snd_una) @@ -5967,6 +5969,12 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (unlikely(th->syn)) goto syn_challenge; + /* Old ACK are common, do not change PAWSESTABREJECTED + * and do not send a dupack. + */ + if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) + goto discard; + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDPAWS, From d16b34479064fcb8dbb66a72308b5034be819161 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jan 2025 13:55:58 +0000 Subject: [PATCH 3/3] tcp: add LINUX_MIB_PAWS_OLD_ACK SNMP counter Prior patch in the series added TCP_RFC7323_PAWS_ACK drop reason. This patch adds the corresponding SNMP counter, for folks using nstat instead of tracing for TCP diagnostics. nstat -az | grep PAWSOldAck Suggested-by: Neal Cardwell Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Jason Xing Tested-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250113135558.3180360-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 1 + include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 7 ++++--- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 28555109f9bd..ed864934e20b 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -262,6 +262,7 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_RFC7323_PAWS, /** * @SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK: PAWS check, old ACK packet. + * Corresponds to LINUX_MIB_PAWS_OLD_ACK. */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 2e75674e7d4f..848c7784e684 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -186,6 +186,7 @@ enum LINUX_MIB_TIMEWAITKILLED, /* TimeWaitKilled */ LINUX_MIB_PAWSACTIVEREJECTED, /* PAWSActiveRejected */ LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ + LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ LINUX_MIB_DELAYEDACKS, /* DelayedACKs */ LINUX_MIB_DELAYEDACKLOCKED, /* DelayedACKLocked */ LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 40053a02bae1..affd21a0f572 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc0e88bcc535..eb82e01da911 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5969,12 +5969,13 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (unlikely(th->syn)) goto syn_challenge; - /* Old ACK are common, do not change PAWSESTABREJECTED + /* Old ACK are common, increment PAWS_OLD_ACK * and do not send a dupack. */ - if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) + if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK); goto discard; - + } NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDPAWS,