mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-15 20:12:00 -04:00
io_uring: validate user-controlled cq.head in io_cqe_cache_refill()
A fuzzing run reproduced an unkillable io_uring task stuck at ~100% CPU:
[root@fedora io_uring_stress]# ps -ef | grep io_uring
root 1240 1 99 13:36 ? 00:01:35 [io_uring_stress] <defunct>
The task loops inside io_cqring_wait() and never returns to userspace,
and SIGKILL has no effect.
This is caused by the CQ ring exposing rings->cq.head to userspace as
writable, while the authoritative tail lives in kernel-private
ctx->cached_cq_tail. io_cqe_cache_refill() computes free space as an
unsigned subtraction:
free = ctx->cq_entries - min(tail - head, ctx->cq_entries);
If userspace keeps head within [0, tail], the subtraction is well
defined and min() just acts as a defensive clamp. But if userspace
advances head past tail, (tail - head) wraps to a huge value, free
becomes 0, and io_cqe_cache_refill() fails. The CQE is pushed onto the
overflow list and IO_CHECK_CQ_OVERFLOW_BIT is set.
The wait loop in io_cqring_wait() relies on an invariant: refill() only
fails when the CQ is *physically* full, in which case rings->cq.tail has
been advanced to iowq->cq_tail and io_should_wake() returns true. The
tampered head breaks this: refill() fails while the ring is not full, no
OCQE is copied in, rings->cq.tail never catches up, io_should_wake()
stays false, and io_cqring_wait_schedule() keeps returning early because
IO_CHECK_CQ_OVERFLOW_BIT is still set. The result is a tight retry loop
that never returns to userspace.
Introduce io_cqring_queued() as the single point that converts the
(tail, head) pair into a trustworthy queued count. Since the real
head/tail distance is bounded by cq_entries (far below 2^31), a signed
comparison reliably detects userspace moving head past tail; in that
case treat the queue as empty so callers see the full cache as free and
forward progress is preserved.
Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
Link: https://patch.msgid.link/20260514021847.4062782-1-wozizhi@huaweicloud.com
[axboe: fixup commit message, kill 'queued' var, and keep it all in
io_uring.c]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
@@ -686,13 +686,27 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 	return ocqe;
 }

/*
|
||||
* Compute queued CQEs for free-space calculation, clamped to cq_entries.
|
||||
*/
|
||||
static unsigned int io_cqring_queued(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_rings *rings = io_get_rings(ctx);
|
||||
int diff;
|
||||
|
||||
diff = (int)(ctx->cached_cq_tail - READ_ONCE(rings->cq.head));
|
||||
if (diff >= 0)
|
||||
return min((unsigned int)diff, ctx->cq_entries);
|
||||
return 0;
|
||||
}

 /*
  * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
  * because the ring is a single 16b entry away from wrapping.
  */
 static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
 {
-	if (__io_cqring_events(ctx) < ctx->cq_entries) {
+	if (io_cqring_queued(ctx) < ctx->cq_entries) {
 		struct io_uring_cqe *cqe = &ctx->rings->cqes[off];

 		cqe->user_data = 0;
@@ -713,7 +727,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
-	unsigned int free, queued, len;
+	unsigned int free, len;

 	/*
 	 * Posting into the CQ when there are pending overflowed CQEs may break
@@ -733,9 +747,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 		off = 0;
 	}

-	/* userspace may cheat modifying the tail, be safe and do min */
-	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
-	free = ctx->cq_entries - queued;
+	free = ctx->cq_entries - io_cqring_queued(ctx);
 	/* we need a contiguous range, limit based on the current array offset */
 	len = min(free, ctx->cq_entries - off);
 	if (len < (cqe32 + 1))
Reference in New Issue
Block a user