Merge branch 'xsk-the-lost-bits-from-chapter-iii'

Alexander Lobakin says:

====================
xsk: the lost bits from Chapter III

Before introducing libeth_xdp, we need to add a couple more generic
helpers. Notably:

* 01: add generic loop unrolling hint helpers;
* 04: add helper to get both xdp_desc's DMA address and metadata
  pointer in one go, saving several cycles and hotpath object
  code size in drivers (especially when unrolling).

Bonus:

* 02, 03: convert two drivers which were using custom macros to
  generic unrolled_count() (trivial, no object code changes).
====================

Link: https://patch.msgid.link/20250206182630.3914318-1-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Committed by Jakub Kicinski on 2025-02-10 17:54:45 -08:00

8 changed files with 141 additions and 26 deletions
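For orientation, here is the pattern these helpers enable in a driver's XSk Tx batching path. This is an editor's sketch, not code from the series: the driver, the function name and the batch size are made up; only unrolled_count() and xsk_buff_raw_get_ctx() come from the patches below.

#include <linux/unroll.h>
#include <net/xdp_sock_drv.h>

#define MY_PKTS_PER_BATCH	8	/* hypothetical batch size */

/* Hypothetical Tx batching routine of an imaginary driver */
static void my_xmit_pkt_batch(struct xsk_buff_pool *pool,
			      const struct xdp_desc *descs)
{
	u32 i;

	unrolled_count(MY_PKTS_PER_BATCH)
	for (i = 0; i < MY_PKTS_PER_BATCH; i++) {
		/* DMA address and Tx metadata pointer in one call */
		struct xdp_desc_ctx ctx = xsk_buff_raw_get_ctx(pool, descs[i].addr);

		/* ... program the HW Tx descriptor from ctx.dma and
		 * descs[i].len; ctx.meta is NULL when no valid metadata
		 * is attached ...
		 */
	}
}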

drivers/net/ethernet/intel/i40e/i40e_xsk.c

@@ -2,6 +2,7 @@
 /* Copyright(c) 2018 Intel Corporation. */
 
 #include <linux/bpf_trace.h>
+#include <linux/unroll.h>
 #include <net/xdp_sock_drv.h>
 #include "i40e_txrx_common.h"
 #include "i40e_xsk.h"
@@ -529,7 +530,8 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des
 	dma_addr_t dma;
 	u32 i;
 
-	loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
+	unrolled_count(PKTS_PER_BATCH)
+	for (i = 0; i < PKTS_PER_BATCH; i++) {
 		u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]);
 
 		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);

drivers/net/ethernet/intel/i40e/i40e_xsk.h

@@ -6,7 +6,7 @@
 #include <linux/types.h>
 
-/* This value should match the pragma in the loop_unrolled_for
+/* This value should match the pragma in the unrolled_count()
  * macro. Why 4? It is strictly empirical. It seems to be a good
  * compromise between the advantage of having simultaneous outstanding
  * reads to the DMA array that can hide each others latency and the
@@ -14,14 +14,6 @@
  */
 #define PKTS_PER_BATCH 4
 
-#ifdef __clang__
-#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
-#elif __GNUC__ >= 8
-#define loop_unrolled_for _Pragma("GCC unroll 4") for
-#else
-#define loop_unrolled_for for
-#endif
-
 struct i40e_ring;
 struct i40e_vsi;
 struct net_device;

drivers/net/ethernet/intel/ice/ice_xsk.c

@@ -2,6 +2,7 @@
 /* Copyright (c) 2019, Intel Corporation. */
 
 #include <linux/bpf_trace.h>
+#include <linux/unroll.h>
 #include <net/xdp_sock_drv.h>
 #include <net/xdp.h>
 #include "ice.h"
@@ -989,7 +990,8 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring,
 	struct ice_tx_desc *tx_desc;
 	u32 i;
 
-	loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
+	unrolled_count(PKTS_PER_BATCH)
+	for (i = 0; i < PKTS_PER_BATCH; i++) {
 		dma_addr_t dma;
 
 		dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr);
drivers/net/ethernet/intel/ice/ice_xsk.h

@@ -7,14 +7,6 @@
 #define PKTS_PER_BATCH 8
 
-#ifdef __clang__
-#define loop_unrolled_for _Pragma("clang loop unroll_count(8)") for
-#elif __GNUC__ >= 8
-#define loop_unrolled_for _Pragma("GCC unroll 8") for
-#else
-#define loop_unrolled_for for
-#endif
-
 struct ice_vsi;
 
 #ifdef CONFIG_XDP_SOCKETS

include/linux/unroll.h

@@ -9,6 +9,50 @@
 #include <linux/args.h>
 
+#ifdef CONFIG_CC_IS_CLANG
+#define __pick_unrolled(x, y)	_Pragma(#x)
+#elif CONFIG_GCC_VERSION >= 80000
+#define __pick_unrolled(x, y)	_Pragma(#y)
+#else
+#define __pick_unrolled(x, y)	/* not supported */
+#endif
+
+/**
+ * unrolled - loop attributes to ask the compiler to unroll it
+ *
+ * Usage:
+ *
+ * #define BATCH 8
+ *
+ *	unrolled_count(BATCH)
+ *	for (u32 i = 0; i < BATCH; i++)
+ *		// loop body without cross-iteration dependencies
+ *
+ * This is only a hint and the compiler is free to disable unrolling if it
+ * thinks the count is suboptimal and may hurt performance and/or hugely
+ * increase object code size.
+ * Not having any cross-iteration dependencies (i.e. when iter x + 1 depends
+ * on what iter x will do with variables) is not a strict requirement, but
+ * provides best performance and object code size.
+ * Available only on Clang and GCC 8.x onwards.
+ */
+
+/* Ask the compiler to pick an optimal unroll count, Clang only */
+#define unrolled \
+	__pick_unrolled(clang loop unroll(enable), /* nothing */)
+
+/* Unroll each @n iterations of the loop */
+#define unrolled_count(n) \
+	__pick_unrolled(clang loop unroll_count(n), GCC unroll n)
+
+/* Unroll the whole loop */
+#define unrolled_full \
+	__pick_unrolled(clang loop unroll(full), GCC unroll 65534)
+
+/* Never unroll the loop */
+#define unrolled_none \
+	__pick_unrolled(clang loop unroll(disable), GCC unroll 1)
+
 #define UNROLL(N, MACRO, args...) CONCATENATE(__UNROLL_, N)(MACRO, args)
 
 #define __UNROLL_0(MACRO, args...)
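For clarity, this is what the new hint boils down to after preprocessing, derived from the __pick_unrolled() definition above; the loop and do_something() are only placeholders for illustration.

	unrolled_count(8)
	for (u32 i = 0; i < 8; i++)
		do_something(i);

/* with Clang (CONFIG_CC_IS_CLANG) this becomes: */

	_Pragma("clang loop unroll_count(8)")
	for (u32 i = 0; i < 8; i++)
		do_something(i);

/* with GCC 8+ (CONFIG_GCC_VERSION >= 80000): */

	_Pragma("GCC unroll 8")
	for (u32 i = 0; i < 8; i++)
		do_something(i);

/* with any other compiler the hint expands to nothing and the loop is
 * compiled as is
 */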

include/net/xdp_sock_drv.h

@@ -196,6 +196,23 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
 	return xp_raw_get_data(pool, addr);
 }
 
+/**
+ * xsk_buff_raw_get_ctx - get &xdp_desc context
+ * @pool: XSk buff pool desc address belongs to
+ * @addr: desc address (from userspace)
+ *
+ * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for
+ * details.
+ *
+ * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata
+ * pointer, if it is present and valid (initialized to %NULL otherwise).
+ */
+static inline struct xdp_desc_ctx
+xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
+{
+	return xp_raw_get_ctx(pool, addr);
+}
+
 #define XDP_TXMD_FLAGS_VALID ( \
 		XDP_TXMD_FLAGS_TIMESTAMP | \
 		XDP_TXMD_FLAGS_CHECKSUM | \
@@ -207,20 +224,27 @@ xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta)
 	return !(meta->flags & ~XDP_TXMD_FLAGS_VALID);
 }
 
-static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+static inline struct xsk_tx_metadata *
+__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data)
 {
 	struct xsk_tx_metadata *meta;
 
 	if (!pool->tx_metadata_len)
 		return NULL;
 
-	meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len;
+	meta = data - pool->tx_metadata_len;
 	if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
 		return NULL; /* no way to signal the error to the user */
 
 	return meta;
 }
 
+static inline struct xsk_tx_metadata *
+xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+	return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr));
+}
+
 static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
 {
 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
@@ -388,12 +412,25 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
 	return NULL;
 }
 
+static inline struct xdp_desc_ctx
+xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
+{
+	return (struct xdp_desc_ctx){ };
+}
+
 static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta)
 {
 	return false;
 }
 
-static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+static inline struct xsk_tx_metadata *
+__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data)
+{
+	return NULL;
+}
+
+static inline struct xsk_tx_metadata *
+xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
 {
 	return NULL;
 }

include/net/xsk_buff_pool.h

@@ -141,6 +141,14 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max);
 bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count);
 void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr);
 dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr);
+
+struct xdp_desc_ctx {
+	dma_addr_t dma;
+	struct xsk_tx_metadata *meta;
+};
+
+struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr);
+
 static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb)
 {
 	return xskb->dma;
net/xdp/xsk_buff_pool.c

@@ -699,18 +699,56 @@ void xp_free(struct xdp_buff_xsk *xskb)
 }
 EXPORT_SYMBOL(xp_free);
 
+static u64 __xp_raw_get_addr(const struct xsk_buff_pool *pool, u64 addr)
+{
+	return pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+}
+
+static void *__xp_raw_get_data(const struct xsk_buff_pool *pool, u64 addr)
+{
+	return pool->addrs + addr;
+}
+
 void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
 {
-	addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
-	return pool->addrs + addr;
+	return __xp_raw_get_data(pool, __xp_raw_get_addr(pool, addr));
 }
 EXPORT_SYMBOL(xp_raw_get_data);
 
-dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
+static dma_addr_t __xp_raw_get_dma(const struct xsk_buff_pool *pool, u64 addr)
 {
-	addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
 	return (pool->dma_pages[addr >> PAGE_SHIFT] &
 		~XSK_NEXT_PG_CONTIG_MASK) +
 		(addr & ~PAGE_MASK);
 }
+
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
+{
+	return __xp_raw_get_dma(pool, __xp_raw_get_addr(pool, addr));
+}
 EXPORT_SYMBOL(xp_raw_get_dma);
 
+/**
+ * xp_raw_get_ctx - get &xdp_desc context
+ * @pool: XSk buff pool desc address belongs to
+ * @addr: desc address (from userspace)
+ *
+ * Helper for getting desc's DMA address and metadata pointer, if present.
+ * Saves one call on hotpath, double calculation of the actual address,
+ * and inline checks for metadata presence and sanity.
+ *
+ * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata
+ * pointer, if it is present and valid (initialized to %NULL otherwise).
+ */
+struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
+{
+	struct xdp_desc_ctx ret;
+
+	addr = __xp_raw_get_addr(pool, addr);
+
+	ret.dma = __xp_raw_get_dma(pool, addr);
+	ret.meta = __xsk_buff_get_metadata(pool, __xp_raw_get_data(pool, addr));
+
+	return ret;
+}
+EXPORT_SYMBOL(xp_raw_get_ctx);
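To illustrate what the consolidation buys on the driver side (an editor's sketch; desc, dma, meta and ctx are hypothetical locals, the helpers are the existing and newly added ones shown above):

/* before: two helpers, each resolving the untyped addr on its own */
dma  = xsk_buff_raw_get_dma(pool, desc->addr);
meta = xsk_buff_get_metadata(pool, desc->addr);

/* after: one helper, addr resolved once, both results returned */
ctx = xsk_buff_raw_get_ctx(pool, desc->addr);
/* use ctx.dma; ctx.meta is NULL when metadata is absent or invalid */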