mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-10 15:13:44 -04:00
Merge branch 'xsk-the-lost-bits-from-chapter-iii'
Alexander Lobakin says:

====================
xsk: the lost bits from Chapter III

Before introducing libeth_xdp, we need to add a couple more generic
helpers. Notably:

* 01: add generic loop unrolling hint helpers;
* 04: add helper to get both xdp_desc's DMA address and metadata
  pointer in one go, saving several cycles and hotpath object code
  size in drivers (especially when unrolling).

Bonus:

* 02, 03: convert two drivers which were using custom macros to
  generic unrolled_count() (trivial, no object code changes).
====================

Link: https://patch.msgid.link/20250206182630.3914318-1-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
/* Copyright(c) 2018 Intel Corporation. */
|
||||
|
||||
#include <linux/bpf_trace.h>
|
||||
#include <linux/unroll.h>
|
||||
#include <net/xdp_sock_drv.h>
|
||||
#include "i40e_txrx_common.h"
|
||||
#include "i40e_xsk.h"
|
||||
@@ -529,7 +530,8 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des
|
||||
dma_addr_t dma;
|
||||
u32 i;
|
||||
|
||||
loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
|
||||
unrolled_count(PKTS_PER_BATCH)
|
||||
for (i = 0; i < PKTS_PER_BATCH; i++) {
|
||||
u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]);
|
||||
|
||||
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
/* This value should match the pragma in the loop_unrolled_for
|
||||
/* This value should match the pragma in the unrolled_count()
|
||||
* macro. Why 4? It is strictly empirical. It seems to be a good
|
||||
* compromise between the advantage of having simultaneous outstanding
|
||||
* reads to the DMA array that can hide each others latency and the
|
||||
@@ -14,14 +14,6 @@
|
||||
*/
|
||||
#define PKTS_PER_BATCH 4
|
||||
|
||||
#ifdef __clang__
|
||||
#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
|
||||
#elif __GNUC__ >= 8
|
||||
#define loop_unrolled_for _Pragma("GCC unroll 4") for
|
||||
#else
|
||||
#define loop_unrolled_for for
|
||||
#endif
|
||||
|
||||
struct i40e_ring;
|
||||
struct i40e_vsi;
|
||||
struct net_device;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
/* Copyright (c) 2019, Intel Corporation. */
|
||||
|
||||
#include <linux/bpf_trace.h>
|
||||
#include <linux/unroll.h>
|
||||
#include <net/xdp_sock_drv.h>
|
||||
#include <net/xdp.h>
|
||||
#include "ice.h"
|
||||
@@ -989,7 +990,8 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring,
|
||||
struct ice_tx_desc *tx_desc;
|
||||
u32 i;
|
||||
|
||||
loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
|
||||
unrolled_count(PKTS_PER_BATCH)
|
||||
for (i = 0; i < PKTS_PER_BATCH; i++) {
|
||||
dma_addr_t dma;
|
||||
|
||||
dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr);
|
||||
|
||||
@@ -7,14 +7,6 @@
|
||||
|
||||
#define PKTS_PER_BATCH 8
|
||||
|
||||
#ifdef __clang__
|
||||
#define loop_unrolled_for _Pragma("clang loop unroll_count(8)") for
|
||||
#elif __GNUC__ >= 8
|
||||
#define loop_unrolled_for _Pragma("GCC unroll 8") for
|
||||
#else
|
||||
#define loop_unrolled_for for
|
||||
#endif
|
||||
|
||||
struct ice_vsi;
|
||||
|
||||
#ifdef CONFIG_XDP_SOCKETS
|
||||
|
||||
@@ -9,6 +9,50 @@
|
||||
|
||||
#include <linux/args.h>
|
||||
|
||||
/*
 * __pick_unrolled(x, y) - emit the loop-unrolling pragma for this compiler
 *
 * @x is the pragma text used when CONFIG_CC_IS_CLANG is defined, @y the one
 * used when CONFIG_GCC_VERSION is at least 80000. In every other case the
 * macro expands to nothing and the loop that follows carries no unroll hint.
 */
#if defined(CONFIG_CC_IS_CLANG)
# define __pick_unrolled(x, y)		_Pragma(#x)
#elif CONFIG_GCC_VERSION >= 80000
# define __pick_unrolled(x, y)		_Pragma(#y)
#else
# define __pick_unrolled(x, y)		/* not supported */
#endif
|
||||
|
||||
/**
|
||||
* unrolled - loop attributes to ask the compiler to unroll it
|
||||
*
|
||||
* Usage:
|
||||
*
|
||||
* #define BATCH 8
|
||||
*
|
||||
* unrolled_count(BATCH)
|
||||
* for (u32 i = 0; i < BATCH; i++)
|
||||
* // loop body without cross-iteration dependencies
|
||||
*
|
||||
* This is only a hint and the compiler is free to disable unrolling if it
|
||||
* thinks the count is suboptimal and may hurt performance and/or hugely
|
||||
* increase object code size.
|
||||
* Not having any cross-iteration dependencies (i.e. when iter x + 1 depends
|
||||
* on what iter x will do with variables) is not a strict requirement, but
|
||||
* provides best performance and object code size.
|
||||
* Available only on Clang and GCC 8.x onwards.
|
||||
*/
|
||||
|
||||
/*
 * Let the compiler choose the unroll count on its own. Only the Clang side
 * carries a pragma; the GCC side is intentionally left empty.
 */
#define unrolled							\
	__pick_unrolled(clang loop unroll(enable), /* nothing */)

/* Hint that the loop following the macro should be unrolled @n times */
#define unrolled_count(n)						\
	__pick_unrolled(clang loop unroll_count(n), GCC unroll n)

/*
 * Hint that the loop should be unrolled completely. Clang has a dedicated
 * keyword for that; the GCC pragma takes a number, so a large count is
 * passed instead.
 */
#define unrolled_full							\
	__pick_unrolled(clang loop unroll(full), GCC unroll 65534)

/* Hint that the loop must be left as is, with no unrolling applied */
#define unrolled_none							\
	__pick_unrolled(clang loop unroll(disable), GCC unroll 1)
|
||||
|
||||
#define UNROLL(N, MACRO, args...) CONCATENATE(__UNROLL_, N)(MACRO, args)
|
||||
|
||||
#define __UNROLL_0(MACRO, args...)
|
||||
|
||||
@@ -196,6 +196,23 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
|
||||
return xp_raw_get_data(pool, addr);
|
||||
}
|
||||
|
||||
/**
|
||||
* xsk_buff_raw_get_ctx - get &xdp_desc context
|
||||
* @pool: XSk buff pool desc address belongs to
|
||||
* @addr: desc address (from userspace)
|
||||
*
|
||||
* Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for
|
||||
* details.
|
||||
*
|
||||
* Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata
|
||||
* pointer, if it is present and valid (initialized to %NULL otherwise).
|
||||
*/
|
||||
static inline struct xdp_desc_ctx
|
||||
xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
|
||||
{
|
||||
return xp_raw_get_ctx(pool, addr);
|
||||
}
|
||||
|
||||
#define XDP_TXMD_FLAGS_VALID ( \
|
||||
XDP_TXMD_FLAGS_TIMESTAMP | \
|
||||
XDP_TXMD_FLAGS_CHECKSUM | \
|
||||
@@ -207,20 +224,27 @@ xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta)
|
||||
return !(meta->flags & ~XDP_TXMD_FLAGS_VALID);
|
||||
}
|
||||
|
||||
static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
|
||||
static inline struct xsk_tx_metadata *
|
||||
__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data)
|
||||
{
|
||||
struct xsk_tx_metadata *meta;
|
||||
|
||||
if (!pool->tx_metadata_len)
|
||||
return NULL;
|
||||
|
||||
meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len;
|
||||
meta = data - pool->tx_metadata_len;
|
||||
if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
|
||||
return NULL; /* no way to signal the error to the user */
|
||||
|
||||
return meta;
|
||||
}
|
||||
|
||||
static inline struct xsk_tx_metadata *
|
||||
xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
|
||||
{
|
||||
return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr));
|
||||
}
|
||||
|
||||
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
|
||||
{
|
||||
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
|
||||
@@ -388,12 +412,25 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct xdp_desc_ctx
|
||||
xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
|
||||
{
|
||||
return (struct xdp_desc_ctx){ };
|
||||
}
|
||||
|
||||
/*
 * Stub variant: @meta is never inspected and no metadata is reported valid.
 * NOTE(review): presumably the build without XSk support; the enclosing
 * preprocessor condition is outside this hunk, confirm.
 */
static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta)
{
	return false;
}
|
||||
|
||||
/*
 * Stub variant: ignores both arguments and never returns any metadata.
 * NOTE(review): presumably the build without XSk support; the enclosing
 * preprocessor condition is outside this hunk, confirm.
 */
static inline struct xsk_tx_metadata *
__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data)
{
	return NULL;
}
|
||||
|
||||
static inline struct xsk_tx_metadata *
|
||||
xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -141,6 +141,14 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max);
|
||||
bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count);
|
||||
void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr);
|
||||
dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr);
|
||||
|
||||
struct xdp_desc_ctx {
|
||||
dma_addr_t dma;
|
||||
struct xsk_tx_metadata *meta;
|
||||
};
|
||||
|
||||
struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr);
|
||||
|
||||
static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb)
|
||||
{
|
||||
return xskb->dma;
|
||||
|
||||
@@ -699,18 +699,56 @@ void xp_free(struct xdp_buff_xsk *xskb)
|
||||
}
|
||||
EXPORT_SYMBOL(xp_free);
|
||||
|
||||
/*
 * Turn a desc address into the address used for pool lookups: pools with
 * ->unaligned set pass it through xp_unaligned_add_offset_to_addr(), any
 * other pool uses it unchanged.
 */
static u64 __xp_raw_get_addr(const struct xsk_buff_pool *pool, u64 addr)
{
	if (pool->unaligned)
		return xp_unaligned_add_offset_to_addr(addr);

	return addr;
}
|
||||
|
||||
/*
 * Return pool->addrs advanced by @addr. Callers in this file pass an address
 * that already went through __xp_raw_get_addr().
 */
static void *__xp_raw_get_data(const struct xsk_buff_pool *pool, u64 addr)
{
	return pool->addrs + addr;
}
|
||||
|
||||
/**
 * xp_raw_get_data - get the data pointer for a desc address
 * @pool: XSk buff pool desc address belongs to
 * @addr: desc address
 *
 * Translates @addr with __xp_raw_get_addr() and resolves the result with
 * __xp_raw_get_data(), so the translation lives in exactly one place.
 *
 * Return: pointer produced by __xp_raw_get_data() for the translated address.
 */
void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
{
	return __xp_raw_get_data(pool, __xp_raw_get_addr(pool, addr));
}
EXPORT_SYMBOL(xp_raw_get_data);
|
||||
|
||||
/*
 * Compute the DMA address for @addr, which callers in this file have already
 * translated with __xp_raw_get_addr() - it must not be translated again here.
 *
 * The entry of pool->dma_pages selected by the page index of @addr is taken
 * with the XSK_NEXT_PG_CONTIG_MASK bits cleared, then the offset of @addr
 * within its page is added on top.
 */
static dma_addr_t __xp_raw_get_dma(const struct xsk_buff_pool *pool, u64 addr)
{
	return (pool->dma_pages[addr >> PAGE_SHIFT] &
		~XSK_NEXT_PG_CONTIG_MASK) +
		(addr & ~PAGE_MASK);
}
|
||||
|
||||
/**
 * xp_raw_get_dma - get the DMA address for a desc address
 * @pool: XSk buff pool desc address belongs to
 * @addr: desc address
 *
 * Return: result of __xp_raw_get_dma() for @addr after it has been
 * translated by __xp_raw_get_addr().
 */
dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
{
	const u64 translated = __xp_raw_get_addr(pool, addr);

	return __xp_raw_get_dma(pool, translated);
}
EXPORT_SYMBOL(xp_raw_get_dma);
|
||||
|
||||
/**
|
||||
* xp_raw_get_ctx - get &xdp_desc context
|
||||
* @pool: XSk buff pool desc address belongs to
|
||||
* @addr: desc address (from userspace)
|
||||
*
|
||||
* Helper for getting desc's DMA address and metadata pointer, if present.
|
||||
* Saves one call on hotpath, double calculation of the actual address,
|
||||
* and inline checks for metadata presence and sanity.
|
||||
*
|
||||
* Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata
|
||||
* pointer, if it is present and valid (initialized to %NULL otherwise).
|
||||
*/
|
||||
struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
|
||||
{
|
||||
struct xdp_desc_ctx ret;
|
||||
|
||||
addr = __xp_raw_get_addr(pool, addr);
|
||||
|
||||
ret.dma = __xp_raw_get_dma(pool, addr);
|
||||
ret.meta = __xsk_buff_get_metadata(pool, __xp_raw_get_data(pool, addr));
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(xp_raw_get_ctx);
|
||||
|
||||
Reference in New Issue
Block a user