From 10a11861943902fda74f37f456b45183b2bca270 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 21 Oct 2025 04:35:42 +0300 Subject: [PATCH 1/2] xfrm: Refactor xfrm_input lock to reduce contention with RSS With newer NICs like mlx5 supporting RSS for IPsec crypto offload, packets for a single Security Association (SA) are scattered across multiple CPU cores for parallel processing. The xfrm_state spinlock (x->lock) is held for each packet during xfrm processing. When multiple connections or flows share the same SA, this parallelism causes high lock contention on x->lock, creating a performance bottleneck and limiting scalability. The original xfrm_input() function exacerbated this issue by releasing and immediately re-acquiring x->lock. For hardware crypto offload paths, this unlock/relock sequence is unnecessary and introduces significant overhead. This patch refactors the function to relocate the type_offload->input_tail call for the offload path, performing all necessary work while continuously holding the lock. This reordering is safe, since packets which don't pass the checks below will still fail them with the new code. Performance testing with iperf using multiple parallel streams over a single IPsec SA shows significant improvement in throughput as the number of queues (and thus CPU cores) increases: +-----------+---------------+--------------+-----------------+ | RX queues | Before (Gbps) | After (Gbps) | Improvement (%) | +-----------+---------------+--------------+-----------------+ | 2 | 32.3 | 34.4 | 6.5 | | 4 | 34.4 | 40.0 | 16.3 | | 6 | 24.5 | 38.3 | 56.3 | | 8 | 23.1 | 38.3 | 65.8 | | 12 | 18.1 | 29.9 | 65.2 | | 16 | 16.0 | 25.2 | 57.5 | +-----------+---------------+--------------+-----------------+ Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c9ddef869aa5..257935cbd221 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -505,6 +505,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) async = 1; dev_put(skb->dev); seq = XFRM_SKB_CB(skb)->seq.input.low; + spin_lock(&x->lock); goto resume; } /* GRO call */ @@ -541,6 +542,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } + + nexthdr = x->type_offload->input_tail(x, skb); } goto lock; @@ -638,11 +641,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop_unlock; } - spin_unlock(&x->lock); - if (xfrm_tunnel_check(skb, x, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); - goto drop; + goto drop_unlock; } seq_hi = htonl(xfrm_replay_seqhi(x, seq)); @@ -650,9 +651,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; - if (crypto_done) { - nexthdr = x->type_offload->input_tail(x, skb); - } else { + if (!crypto_done) { + spin_unlock(&x->lock); dev_hold(skb->dev); nexthdr = x->type->input(x, skb); @@ -660,9 +660,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) return 0; dev_put(skb->dev); + spin_lock(&x->lock); } resume: - spin_lock(&x->lock); if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, From b427c0c3bc40cca268a5d54a1cdf6166cb1360e2 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 21 Oct 2025 04:35:43 +0300 Subject: [PATCH 2/2] xfrm: Skip redundant replay recheck for the hardware offload path The xfrm_replay_recheck() function was introduced to handle the issues arising from asynchronous crypto algorithms. The crypto offload path is now effectively synchronous, as it holds the state lock throughout its operation. This eliminates the race condition, making the recheck an unnecessary overhead. This patch improves performance by skipping the redundant call when crypto_done is true. Additionally, the sequence number assignment is moved to an earlier point in the function. This improves performance by reducing lock contention and places the logic at a more appropriate point, as the full sequence number (including the higher-order bits) can be determined as soon as the packet is received. Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 257935cbd221..4ed346e682c7 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -546,7 +546,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) nexthdr = x->type_offload->input_tail(x, skb); } - goto lock; + goto process; } family = XFRM_SPI_SKB_CB(skb)->family; @@ -614,7 +614,12 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } -lock: +process: + seq_hi = htonl(xfrm_replay_seqhi(x, seq)); + + XFRM_SKB_CB(skb)->seq.input.low = seq; + XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; + spin_lock(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { @@ -646,11 +651,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop_unlock; } - seq_hi = htonl(xfrm_replay_seqhi(x, seq)); - - XFRM_SKB_CB(skb)->seq.input.low = seq; - XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; - if (!crypto_done) { spin_unlock(&x->lock); dev_hold(skb->dev); @@ -676,7 +676,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) /* only the first xfrm gets the encap type */ encap_type = 0; - if (xfrm_replay_recheck(x, skb, seq)) { + if (!crypto_done && xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; }