From c30d084960cf316c95fbf145d39974ce1ff7889c Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski
Date: Thu, 25 Sep 2025 18:00:07 +0200
Subject: [PATCH 1/3] xsk: avoid overwriting skb fields for multi-buffer traffic

We are unnecessarily setting a bunch of skb fields for each processed
descriptor, which is redundant for fragmented frames. Set these members
for the first fragment only. To cover both paths within
xsk_build_skb(), move the assignments into xsk_set_destructor_arg() and
rename it to xsk_skb_init_misc().

Signed-off-by: Maciej Fijalkowski
Acked-by: Stanislav Fomichev
Reviewed-by: Jason Xing
Acked-by: Martin KaFai Lau
Link: https://patch.msgid.link/20250925160009.2474816-2-maciej.fijalkowski@intel.com
Signed-off-by: Jakub Kicinski
---
 net/xdp/xsk.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 72e34bd2d925..01f258894fae 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -618,11 +618,16 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 
-static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr)
+static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
+			      u64 addr)
 {
 	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
 	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
+	skb->dev = xs->dev;
+	skb->priority = READ_ONCE(xs->sk.sk_priority);
+	skb->mark = READ_ONCE(xs->sk.sk_mark);
 	XSKCB(skb)->num_descs = 0;
+	skb->destructor = xsk_destruct_skb;
 	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
 }
 
@@ -673,7 +678,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 
 		skb_reserve(skb, hr);
 
-		xsk_set_destructor_arg(skb, desc->addr);
+		xsk_skb_init_misc(skb, xs, desc->addr);
 	} else {
 		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
 		if (!xsk_addr)
@@ -757,7 +762,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		if (unlikely(err))
 			goto free_err;
 
-		xsk_set_destructor_arg(skb, desc->addr);
+		xsk_skb_init_misc(skb, xs, desc->addr);
 	} else {
 		int nr_frags = skb_shinfo(skb)->nr_frags;
 		struct xsk_addr_node *xsk_addr;
@@ -826,14 +831,10 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 		if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
 			skb->skb_mstamp_ns = meta->request.launch_time;
+		xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
 		}
 	}
 
-	skb->dev = dev;
-	skb->priority = READ_ONCE(xs->sk.sk_priority);
-	skb->mark = READ_ONCE(xs->sk.sk_mark);
-	skb->destructor = xsk_destruct_skb;
-	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
 	xsk_inc_num_desc(skb);
 
 	return skb;

From 6b9c129c2f93df545248e26434720928f249ff2e Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski
Date: Thu, 25 Sep 2025 18:00:08 +0200
Subject: [PATCH 2/3] xsk: remove @first_frag from xsk_build_skb()

Instead of using an auxiliary boolean that tracks whether we are at the
first frag while gathering all elements of the skb, the same can be
achieved by checking whether skb_shared_info::nr_frags is 0. Remove
@first_frag, but be careful around xsk_build_skb_zerocopy(): NULL the
skb pointer when it fails so that the common error path does not
misinterpret it when deciding whether to call kfree_skb().
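To see why the conversion is safe: page fragments are attached with
skb_fill_page_desc(), which bumps shinfo's nr_frags, so the count is
still zero exactly while we handle a frame's first descriptor. A minimal
sketch of the equivalent predicate (the helper name
xsk_skb_is_first_frag() is hypothetical, not part of this patch):

  /* Hypothetical illustration: nr_frags stays 0 until
   * skb_fill_page_desc() attaches the first page fragment, so it can
   * stand in for the removed @first_frag boolean.
   */
  static bool xsk_skb_is_first_frag(const struct sk_buff *skb)
  {
  	return skb_shinfo(skb)->nr_frags == 0;
  }

The skb = NULL assignment matters because xsk_build_skb_zerocopy()
returns an ERR_PTR() on failure; without clearing the pointer, the
shared free_err path's skb_shinfo(skb)->nr_frags test would dereference
that encoded error pointer.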
Signed-off-by: Maciej Fijalkowski
Acked-by: Stanislav Fomichev
Reviewed-by: Jason Xing
Acked-by: Martin KaFai Lau
Link: https://patch.msgid.link/20250925160009.2474816-3-maciej.fijalkowski@intel.com
Signed-off-by: Jakub Kicinski
---
 net/xdp/xsk.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 01f258894fae..f7e0d254a723 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -730,13 +730,13 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 	struct xsk_tx_metadata *meta = NULL;
 	struct net_device *dev = xs->dev;
 	struct sk_buff *skb = xs->skb;
-	bool first_frag = false;
 	int err;
 
 	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
 		skb = xsk_build_skb_zerocopy(xs, desc);
 		if (IS_ERR(skb)) {
 			err = PTR_ERR(skb);
+			skb = NULL;
 			goto free_err;
 		}
 	} else {
@@ -747,8 +747,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		len = desc->len;
 
 		if (!skb) {
-			first_frag = true;
-
 			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
 			tr = dev->needed_tailroom;
 			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
@@ -798,7 +796,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
 		}
 
-		if (first_frag && desc->options & XDP_TX_METADATA) {
+		if (!skb_shinfo(skb)->nr_frags && desc->options & XDP_TX_METADATA) {
 			if (unlikely(xs->pool->tx_metadata_len == 0)) {
 				err = -EINVAL;
 				goto free_err;
@@ -840,7 +838,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 	return skb;
 
 free_err:
-	if (first_frag && skb)
+	if (skb && !skb_shinfo(skb)->nr_frags)
 		kfree_skb(skb);
 
 	if (err == -EOVERFLOW) {

From 30c3055f9c0d84a67b8fd723bdec9b1b52b3c695 Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski
Date: Thu, 25 Sep 2025 18:00:09 +0200
Subject: [PATCH 3/3] xsk: wrap generic metadata handling onto separate function

xsk_build_skb() has grown considerably in size; one of the things we can
do about it is to pull out the branch that takes care of metadata
handling and make it a separate function. While at it, add SW metadata
support for devices with the IFF_TX_SKB_NO_LINEAR flag, which have
separate logic for building the skb in xsk's generic xmit path.
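The helper returns 0 or a negative errno and leaves unwinding to the
caller, so both build paths can share it. A condensed sketch of the
resulting call pattern on the first fragment (see the diff below for the
exact context):

  /* Condensed from the diff below: both skb-build paths funnel
   * XDP_TX_METADATA handling through one helper and differ only in how
   * they unwind on error.
   */
  if (desc->options & XDP_TX_METADATA) {
  	err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
  	if (unlikely(err))
  		return ERR_PTR(err);	/* goto free_err in xsk_build_skb() */
  }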
Acked-by: Stanislav Fomichev
Reviewed-by: Jason Xing
Signed-off-by: Maciej Fijalkowski
Acked-by: Martin KaFai Lau
Link: https://patch.msgid.link/20250925160009.2474816-4-maciej.fijalkowski@intel.com
Signed-off-by: Jakub Kicinski
---
 net/xdp/xsk.c | 92 +++++++++++++++++++++++++++++----------------------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f7e0d254a723..7b0c68a70888 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -657,6 +657,45 @@ static void xsk_drop_skb(struct sk_buff *skb)
 	xsk_consume_skb(skb);
 }
 
+static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
+			    struct xdp_desc *desc, struct xsk_buff_pool *pool,
+			    u32 hr)
+{
+	struct xsk_tx_metadata *meta = NULL;
+
+	if (unlikely(pool->tx_metadata_len == 0))
+		return -EINVAL;
+
+	meta = buffer - pool->tx_metadata_len;
+	if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
+		return -EINVAL;
+
+	if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
+		if (unlikely(meta->request.csum_start +
+			     meta->request.csum_offset +
+			     sizeof(__sum16) > desc->len))
+			return -EINVAL;
+
+		skb->csum_start = hr + meta->request.csum_start;
+		skb->csum_offset = meta->request.csum_offset;
+		skb->ip_summed = CHECKSUM_PARTIAL;
+
+		if (unlikely(pool->tx_sw_csum)) {
+			int err;
+
+			err = skb_checksum_help(skb);
+			if (err)
+				return err;
+		}
+	}
+
+	if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
+		skb->skb_mstamp_ns = meta->request.launch_time;
+	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
+
+	return 0;
+}
+
 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 					      struct xdp_desc *desc)
 {
@@ -669,6 +708,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 	int err, i;
 	u64 addr;
 
+	addr = desc->addr;
+	buffer = xsk_buff_raw_get_data(pool, addr);
+
 	if (!skb) {
 		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
 
@@ -679,6 +721,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 		skb_reserve(skb, hr);
 
 		xsk_skb_init_misc(skb, xs, desc->addr);
+		if (desc->options & XDP_TX_METADATA) {
+			err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
+			if (unlikely(err))
+				return ERR_PTR(err);
+		}
 	} else {
 		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
 		if (!xsk_addr)
@@ -692,11 +739,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
 	}
 
-	addr = desc->addr;
 	len = desc->len;
 	ts = pool->unaligned ? len : pool->chunk_size;
 
-	buffer = xsk_buff_raw_get_data(pool, addr);
 	offset = offset_in_page(buffer);
 	addr = buffer - pool->addrs;
 
@@ -727,7 +772,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 				     struct xdp_desc *desc)
 {
-	struct xsk_tx_metadata *meta = NULL;
 	struct net_device *dev = xs->dev;
 	struct sk_buff *skb = xs->skb;
 	int err;
@@ -761,6 +805,12 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			goto free_err;
 
 		xsk_skb_init_misc(skb, xs, desc->addr);
+		if (desc->options & XDP_TX_METADATA) {
+			err = xsk_skb_metadata(skb, buffer, desc,
+					       xs->pool, hr);
+			if (unlikely(err))
+				goto free_err;
+		}
 	} else {
 		int nr_frags = skb_shinfo(skb)->nr_frags;
 		struct xsk_addr_node *xsk_addr;
@@ -795,42 +845,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		xsk_addr->addr = desc->addr;
 		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
 		}
-
-		if (!skb_shinfo(skb)->nr_frags && desc->options & XDP_TX_METADATA) {
-			if (unlikely(xs->pool->tx_metadata_len == 0)) {
-				err = -EINVAL;
-				goto free_err;
-			}
-
-			meta = buffer - xs->pool->tx_metadata_len;
-			if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
-				err = -EINVAL;
-				goto free_err;
-			}
-
-			if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
-				if (unlikely(meta->request.csum_start +
-					     meta->request.csum_offset +
-					     sizeof(__sum16) > len)) {
-					err = -EINVAL;
-					goto free_err;
-				}
-
-				skb->csum_start = hr + meta->request.csum_start;
-				skb->csum_offset = meta->request.csum_offset;
-				skb->ip_summed = CHECKSUM_PARTIAL;
-
-				if (unlikely(xs->pool->tx_sw_csum)) {
-					err = skb_checksum_help(skb);
-					if (err)
-						goto free_err;
-				}
-			}
-
-			if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
-				skb->skb_mstamp_ns = meta->request.launch_time;
-			xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
-		}
 	}
 
 	xsk_inc_num_desc(skb);
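Taken together, the series concentrates per-frame setup at
first-fragment time. As a hypothetical summary helper
(xsk_skb_handle_first_frag() is not part of the series; it merely
restates the flow the diffs above establish):

  /* Hypothetical restatement of the post-series first-fragment work:
   * initialize skb fields once per frame, then optionally process
   * metadata; later fragments skip both steps.
   */
  static int xsk_skb_handle_first_frag(struct sk_buff *skb,
  				     struct xdp_sock *xs, void *buffer,
  				     struct xdp_desc *desc, u32 hr)
  {
  	if (skb_shinfo(skb)->nr_frags)
  		return 0;	/* not the first fragment */
  
  	xsk_skb_init_misc(skb, xs, desc->addr);
  	if (desc->options & XDP_TX_METADATA)
  		return xsk_skb_metadata(skb, buffer, desc, xs->pool, hr);
  	return 0;
  }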