io_uring: introduce non-circular SQ

Outside of SQPOLL, normally SQ entries are consumed by the time the
submission syscall returns. For those cases we don't need a circular
buffer and the head/tail tracking, instead the kernel can assume that
entries always start from the beginning of the SQ at index 0. This patch
introduces a setup flag doing exactly that. It's simpler and helps
to keep SQEs hot in cache.

The feature is optional and enabled by setting IORING_SETUP_SQ_REWIND.
The flag is rejected if passed together with SQPOLL as it'd require
waiting for SQ before each submission. It also requires
IORING_SETUP_NO_SQARRAY; the SQ array could be supported, but it's unlikely
there will be users, so leave more space for future optimisations.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Pavel Begunkov
2026-01-20 20:47:40 +00:00
committed by Jens Axboe
parent 0105b0562a
commit 5247c034a6
3 changed files with 36 additions and 8 deletions

View File

@@ -237,6 +237,18 @@ enum io_uring_sqe_flags_bit {
*/ */
#define IORING_SETUP_SQE_MIXED (1U << 19) #define IORING_SETUP_SQE_MIXED (1U << 19)
/*
* When set, io_uring ignores SQ head and tail and fetches SQEs to submit
* starting from index 0 instead of from the index stored in the head pointer.
* IOW, the user should place all SQE at the beginning of the SQ memory
* before issuing a submission syscall.
*
* It requires IORING_SETUP_NO_SQARRAY and is incompatible with
* IORING_SETUP_SQPOLL. The user must also never change the SQ head and tail
* values and keep them set to 0. Any other value is undefined behaviour.
*/
#define IORING_SETUP_SQ_REWIND (1U << 20)
enum io_uring_op { enum io_uring_op {
IORING_OP_NOP, IORING_OP_NOP,
IORING_OP_READV, IORING_OP_READV,

View File

@@ -1945,12 +1945,16 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
{ {
struct io_rings *rings = ctx->rings; struct io_rings *rings = ctx->rings;
/* if (ctx->flags & IORING_SETUP_SQ_REWIND) {
* Ensure any loads from the SQEs are done at this point, ctx->cached_sq_head = 0;
* since once we write the new head, the application could } else {
* write new data to them. /*
*/ * Ensure any loads from the SQEs are done at this point,
smp_store_release(&rings->sq.head, ctx->cached_sq_head); * since once we write the new head, the application could
* write new data to them.
*/
smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}
} }
/* /*
@@ -1996,10 +2000,15 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock) __must_hold(&ctx->uring_lock)
{ {
unsigned int entries = io_sqring_entries(ctx); unsigned int entries;
unsigned int left; unsigned int left;
int ret; int ret;
if (ctx->flags & IORING_SETUP_SQ_REWIND)
entries = ctx->sq_entries;
else
entries = io_sqring_entries(ctx);
entries = min(nr, entries); entries = min(nr, entries);
if (unlikely(!entries)) if (unlikely(!entries))
return 0; return 0;
@@ -2728,6 +2737,12 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
if (flags & ~IORING_SETUP_FLAGS) if (flags & ~IORING_SETUP_FLAGS)
return -EINVAL; return -EINVAL;
if (flags & IORING_SETUP_SQ_REWIND) {
if ((flags & IORING_SETUP_SQPOLL) ||
!(flags & IORING_SETUP_NO_SQARRAY))
return -EINVAL;
}
/* There is no way to mmap rings without a real fd */ /* There is no way to mmap rings without a real fd */
if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) &&
!(flags & IORING_SETUP_NO_MMAP)) !(flags & IORING_SETUP_NO_MMAP))

View File

@@ -69,7 +69,8 @@ struct io_ctx_config {
IORING_SETUP_NO_SQARRAY |\ IORING_SETUP_NO_SQARRAY |\
IORING_SETUP_HYBRID_IOPOLL |\ IORING_SETUP_HYBRID_IOPOLL |\
IORING_SETUP_CQE_MIXED |\ IORING_SETUP_CQE_MIXED |\
IORING_SETUP_SQE_MIXED) IORING_SETUP_SQE_MIXED |\
IORING_SETUP_SQ_REWIND)
#define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
IORING_ENTER_SQ_WAKEUP |\ IORING_ENTER_SQ_WAKEUP |\