From 3799c2570982577551023ae035f5a786cf39a76e Mon Sep 17 00:00:00 2001
From: Maoyi Xie
Date: Sun, 10 May 2026 16:41:19 +0800
Subject: [PATCH 1/7] io_uring/fdinfo: translate SqThread PID through caller's
 pid_ns

SQPOLL stores current->pid (init_pid_ns view) in sqd->task_pid at
thread creation. fdinfo prints it raw via
seq_printf("SqThread:\t%d\n", sq_pid). A reader inside a non-initial
pid_ns sees the host PID, not the kthread's PID in the reader's own
pid_ns.

The SQPOLL kthread is created with CLONE_THREAD and no CLONE_NEW*, so
it lives in the submitter's pid_ns. An unprivileged user_ns + pid_ns
submitter can read fdinfo and learn the host PID of a kthread whose
in-namespace PID is different.

Reproducer (mainline 7.0, KASAN): unshare CLONE_NEWUSER | CLONE_NEWPID
| CLONE_NEWNS, mount a private /proc, then have a grandchild that is
pid 1 in the new pid_ns open an io_uring ring with IORING_SETUP_SQPOLL.
/proc/self/task lists {1, 2}; the SQPOLL kthread is pid 2. Before,
fdinfo reports the host-namespace PID for SqThread; after, it reports
SqThread = 2.

Use task_pid_nr_ns() against the proc inode's pid_ns to compute sq_pid,
instead of reading the stored sq->task_pid (which holds the init_pid_ns
view). pidfd_show_fdinfo() in kernel/pid.c follows the same pattern.

Signed-off-by: Maoyi Xie
Link: https://patch.msgid.link/20260510084119.457578-1-maoyi.xie@ntu.edu.sg
Signed-off-by: Jens Axboe
---
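For illustration, a minimal userspace sketch of the reproducer described
above. It assumes liburing is available; the file name, and running it as
pid 1 of a fresh user/pid/mount namespace with a newly mounted /proc
(e.g. via "unshare -Urpmf --mount-proc"), are assumptions of the sketch,
not part of the change:

/* sqpoll_fdinfo.c: print the SqThread line for an SQPOLL ring */
#include <liburing.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	char path[64], line[256];
	FILE *f;
	int ret;

	/* IORING_SETUP_SQPOLL spawns the kernel submission thread */
	ret = io_uring_queue_init(8, &ring, IORING_SETUP_SQPOLL);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ring.ring_fd);
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "SqThread", 8))
			fputs(line, stdout);
	}
	fclose(f);
	io_uring_queue_exit(&ring);
	return 0;
}

With the translation in place, the printed SqThread value is a PID that
exists in the reader's own /proc.
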
 io_uring/fdinfo.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index c2d3e45544bb..001fb542dc11 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -190,8 +190,9 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 			get_task_struct(tsk);
 			rcu_read_unlock();
 			usec = io_sq_cpu_usec(tsk);
+			sq_pid = task_pid_nr_ns(tsk,
+					proc_pid_ns(file_inode(m->file)->i_sb));
 			put_task_struct(tsk);
-			sq_pid = sq->task_pid;
 			sq_cpu = sq->sq_cpu;
 			sq_total_time = usec;
 			sq_work_time = sq->work_time;

From 20c39819a27646573dfa0ac0d01c38895298a6f6 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 11 May 2026 10:58:38 -0600
Subject: [PATCH 2/7] io_uring: hold uring_lock when walking link chain in
 io_wq_free_work()

io_wq_free_work() calls io_req_find_next() from io-wq worker context,
which reads and clears req->link without holding any lock. This can
potentially race with other paths that mutate the same chain under
ctx->uring_lock.

Take ctx->uring_lock around the io_req_find_next() call. Only requests
with IO_REQ_LINK_FLAGS reach this path, which is not the hot path.

Signed-off-by: Jens Axboe
---
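For illustration only, a userspace analogue of the locking rule this
change applies (plain C with pthreads; the struct and function names are
invented for the example and the mutex merely stands in for
ctx->uring_lock): any path that walks the chain and rewrites ->link takes
the same lock as the paths that splice entries out, so neither can observe
a half-updated chain:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct req {
	struct req *link;	/* next request in the chain */
	int id;
};

static pthread_mutex_t uring_lock = PTHREAD_MUTEX_INITIALIZER;

/* analogue of io_req_find_next(): detach and hand back the follow-up request */
static struct req *req_find_next(struct req *req)
{
	struct req *nxt;

	pthread_mutex_lock(&uring_lock);
	nxt = req->link;
	req->link = NULL;
	pthread_mutex_unlock(&uring_lock);
	return nxt;
}

/* analogue of io_remove_next_linked(): splice out req's successor */
static void remove_next_linked(struct req *req)
{
	pthread_mutex_lock(&uring_lock);
	if (req->link)
		req->link = req->link->link;
	pthread_mutex_unlock(&uring_lock);
}

int main(void)
{
	struct req c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	struct req *nxt;

	remove_next_linked(&a);		/* chain is now a -> c */
	nxt = req_find_next(&a);	/* detach c, chain now empty */
	printf("next after %d: %d\n", a.id, nxt ? nxt->id : -1);
	return 0;
}

Built with -pthread, this prints "next after 1: 3".
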
 io_uring/io_uring.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4ed998d60c09..2ebb0ba37c4f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1452,8 +1452,13 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
 	struct io_kiocb *nxt = NULL;
 
 	if (req_ref_put_and_test_atomic(req)) {
-		if (req->flags & IO_REQ_LINK_FLAGS)
+		if (req->flags & IO_REQ_LINK_FLAGS) {
+			struct io_ring_ctx *ctx = req->ctx;
+
+			mutex_lock(&ctx->uring_lock);
 			nxt = io_req_find_next(req);
+			mutex_unlock(&ctx->uring_lock);
+		}
 		io_free_req(req);
 	}
 	return nxt ? &nxt->work : NULL;

From 49ae66eb8c27375075ffa308cfd4bf25af335d41 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 11 May 2026 10:58:50 -0600
Subject: [PATCH 3/7] io_uring: defer linked-timeout chain splice out of
 hrtimer context

io_link_timeout_fn() is the hrtimer callback that fires when a linked
timeout expires. It currently calls io_remove_next_linked(prev) under
ctx->timeout_lock to splice the timeout request out of the link chain.
This is the only chain-mutation site that runs without ctx->uring_lock,
because hrtimer callbacks cannot take a mutex.

Defer the splicing until the task_work callback.

Signed-off-by: Jens Axboe
---
 io_uring/timeout.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index e2595cae2b07..6353a4d979dc 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -284,6 +284,10 @@ static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
 	struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout);
 
 	io_remove_next_linked(req);
+
+	/* If this is NULL, then timer already claimed it and will complete it */
+	if (!timeout->head)
+		return NULL;
 	timeout->head = NULL;
 	if (hrtimer_try_to_cancel(&io->timer) != -1) {
 		list_del(&timeout->list);
@@ -367,6 +371,14 @@ static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw)
 	int ret;
 
 	if (prev) {
+		/*
+		 * splice the linked timeout out of prev's chain if the regular
+		 * completion path didn't already do it.
+		 */
+		if (prev->link == req)
+			prev->link = req->link;
+		req->link = NULL;
+
 		if (!tw.cancel) {
 			struct io_cancel_data cd = {
 				.ctx		= req->ctx,
@@ -401,10 +413,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 
 	/*
 	 * We don't expect the list to be empty, that will only happen if we
-	 * race with the completion of the linked work.
+	 * race with the completion of the linked work. Splice of prev is
+	 * done in io_req_task_link_timeout(), if needed.
 	 */
 	if (prev) {
-		io_remove_next_linked(prev);
 		if (!req_ref_inc_not_zero(prev))
 			prev = NULL;
 	}

From a65855ec34aed84e1e5b4aea0323cc1745f83a5c Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 11 May 2026 10:58:56 -0600
Subject: [PATCH 4/7] io_uring: hold uring_lock across io_kill_timeouts() in
 cancel path

io_uring_try_cancel_requests() dropped ctx->uring_lock before calling
io_kill_timeouts(), which walks each timeout's link chain via
io_match_task() to test REQ_F_INFLIGHT. With chain mutation now
serialized by ctx->uring_lock, that walk needs the lock too.

Signed-off-by: Jens Axboe
---
 io_uring/cancel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 5e5eb9cfc7cd..4aa3103ba9c3 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -561,8 +561,8 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 	ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
 	ret |= io_futex_remove_all(ctx, tctx, cancel_all);
 	ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all);
-	mutex_unlock(&ctx->uring_lock);
 	ret |= io_kill_timeouts(ctx, tctx, cancel_all);
+	mutex_unlock(&ctx->uring_lock);
 	if (tctx)
 		ret |= io_run_task_work() > 0;
 	else

From 5f7c7c63ffb1a187eb90c80864469db45f3bd2a8 Mon Sep 17 00:00:00 2001
From: Yang Xiuwei
Date: Wed, 13 May 2026 17:43:03 +0800
Subject: [PATCH 5/7] io_uring/rw: drop unused attr_type_mask from
 io_prep_rw_pi()

io_prep_rw_pi() never used the attr_type_mask argument. Callers already
validate sqe->attr_type_mask before invoking the helper (only
IORING_RW_ATTR_FLAG_PI is supported today). Remove the dead parameter
to avoid implying further interpretation happens here.

Signed-off-by: Yang Xiuwei
Link: https://patch.msgid.link/20260513094303.866533-1-yangxiuwei@kylinos.cn
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/rw.c b/io_uring/rw.c
index e729e0e7657e..0c4834645279 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -230,7 +230,7 @@ static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
 }
 
 static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
-			 u64 attr_ptr, u64 attr_type_mask)
+			 u64 attr_ptr)
 {
 	struct io_uring_attr_pi pi_attr;
 	struct io_async_rw *io;
@@ -305,7 +305,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			return -EINVAL;
 
 		attr_ptr = READ_ONCE(sqe->attr_ptr);
-		return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
+		return io_prep_rw_pi(req, rw, ddir, attr_ptr);
 	}
 	return 0;
 }

From d6a2d7b04b5a093021a7a0e2e69e9d5237dfa8cc Mon Sep 17 00:00:00 2001
From: Nicholas Carlini
Date: Mon, 11 May 2026 18:02:16 +0000
Subject: [PATCH 6/7] io-wq: check that the predecessor is hashed in
 io_wq_remove_pending()

io_wq_remove_pending() needs to fix up wq->hash_tail[] if the cancelled
work was the tail of its hash bucket. When doing this, it checks
whether the preceding entry in acct->work_list has the same hash value,
but never checks that the predecessor is hashed at all.

io_get_work_hash() is simply atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT,
and the hash bits are never set for non-hashed work, so it returns 0.
Thus, when a hashed bucket-0 work is cancelled while a non-hashed work
is its list predecessor, the check spuriously passes and a pointer to
the non-hashed io_kiocb is stored in wq->hash_tail[0].

Because non-hashed work is dequeued via the fast path in
io_get_next_work(), which never touches hash_tail[], the stale pointer
is never cleared. Therefore, after the non-hashed io_kiocb completes
and is freed back to req_cachep, wq->hash_tail[0] is a dangling
pointer. The io_wq is per-task (tctx->io_wq) and survives ring
open/close, so the dangling pointer persists for the lifetime of the
task; the next hashed bucket-0 enqueue dereferences it in
io_wq_insert_work() and wq_list_add_after() writes through freed
memory.

Add the missing io_wq_is_hashed() check so a non-hashed predecessor
never inherits a hash_tail[] slot.

Cc: stable@vger.kernel.org
Fixes: 204361a77f40 ("io-wq: fix hang after cancelling pending hashed work")
Signed-off-by: Nicholas Carlini
Signed-off-by: Jens Axboe
---
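For illustration, a small standalone program showing why the old check
could not distinguish "not hashed" from "hashed into bucket 0" (plain
userspace C; the two constants mirror the layout described above but are
assumptions for the example, not values taken from io-wq.h):

#include <stdio.h>

/* assumed layout: hash key in the top bits, hashed marker in the low bits */
#define IO_WQ_HASH_SHIFT	24
#define IO_WQ_WORK_HASHED	(1U << 1)

static unsigned int get_work_hash(unsigned int flags)
{
	return flags >> IO_WQ_HASH_SHIFT;
}

static int is_hashed(unsigned int flags)
{
	return !!(flags & IO_WQ_WORK_HASHED);
}

int main(void)
{
	unsigned int hashed_bucket0 = IO_WQ_WORK_HASHED | (0U << IO_WQ_HASH_SHIFT);
	unsigned int not_hashed = 0;

	/* both report hash 0: the hash value alone cannot tell them apart */
	printf("hashed work:  bucket %u, hashed=%d\n",
	       get_work_hash(hashed_bucket0), is_hashed(hashed_bucket0));
	printf("plain work:   bucket %u, hashed=%d\n",
	       get_work_hash(not_hashed), is_hashed(not_hashed));
	return 0;
}

Both works report hash 0; only the explicit hashed test separates them,
which is exactly the check this patch adds.
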
 io_uring/io-wq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 7a9f94a0ce6f..8cc7b47d3089 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1124,7 +1124,8 @@ static inline void io_wq_remove_pending(struct io_wq *wq,
 	if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) {
 		if (prev)
 			prev_work = container_of(prev, struct io_wq_work, list);
-		if (prev_work && io_get_work_hash(prev_work) == hash)
+		if (prev_work && io_wq_is_hashed(prev_work) &&
+		    io_get_work_hash(prev_work) == hash)
 			wq->hash_tail[hash] = prev_work;
 		else
 			wq->hash_tail[hash] = NULL;

From f44d38a31f1802b7222adaea9ee69f9d280f698a Mon Sep 17 00:00:00 2001
From: Zizhi Wo
Date: Thu, 14 May 2026 10:18:47 +0800
Subject: [PATCH 7/7] io_uring: validate user-controlled cq.head in
 io_cqe_cache_refill()

A fuzzing run reproduced an unkillable io_uring task stuck at ~100% CPU:

  [root@fedora io_uring_stress]# ps -ef | grep io_uring
  root      1240     1 99 13:36 ?  00:01:35 [io_uring_stress]

The task loops inside io_cqring_wait() and never returns to userspace,
and SIGKILL has no effect.

This is caused by the CQ ring exposing rings->cq.head to userspace as
writable, while the authoritative tail lives in kernel-private
ctx->cached_cq_tail. io_cqe_cache_refill() computes free space as an
unsigned subtraction:

  free = ctx->cq_entries - min(tail - head, ctx->cq_entries);

If userspace keeps head within [0, tail], the subtraction is well
defined and min() just acts as a defensive clamp. But if userspace
advances head past tail, (tail - head) wraps to a huge value, free
becomes 0, and io_cqe_cache_refill() fails. The CQE is pushed onto the
overflow list and IO_CHECK_CQ_OVERFLOW_BIT is set.

The wait loop in io_cqring_wait() relies on an invariant: refill() only
fails when the CQ is *physically* full, in which case rings->cq.tail
has been advanced to iowq->cq_tail and io_should_wake() returns true.
The tampered head breaks this: refill() fails while the ring is not
full, no OCQE is copied in, rings->cq.tail never catches up,
io_should_wake() stays false, and io_cqring_wait_schedule() keeps
returning early because IO_CHECK_CQ_OVERFLOW_BIT is still set. The
result is a tight retry loop that never returns to userspace.

Introduce io_cqring_queued() as the single point that converts the
(tail, head) pair into a trustworthy queued count. Since the real
head/tail distance is bounded by cq_entries (far below 2^31), a signed
comparison reliably detects userspace moving head past tail; in that
case treat the queue as empty so callers see the full cache as free and
forward progress is preserved.

Suggested-by: Jens Axboe
Signed-off-by: Zizhi Wo
Link: https://patch.msgid.link/20260514021847.4062782-1-wozizhi@huaweicloud.com
[axboe: fixup commit message, kill 'queued' var, and keep it all in io_uring.c]
Signed-off-by: Jens Axboe
---
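For illustration, the arithmetic at the heart of the change as a
standalone userspace program (plain C, not kernel code; CQ_ENTRIES and
the helper below merely mirror io_cqring_queued()): once head is pushed
past tail, the unsigned difference wraps and the old clamped calculation
reports a full ring, while the signed-difference check treats the ring as
empty and leaves all of it free:

#include <stdio.h>

#define CQ_ENTRIES	128U

/* mirrors io_cqring_queued(): clamp, and treat head > tail as empty */
static unsigned int cqring_queued(unsigned int tail, unsigned int head)
{
	int diff = (int)(tail - head);

	if (diff >= 0)
		return (unsigned int)diff < CQ_ENTRIES ? (unsigned int)diff : CQ_ENTRIES;
	return 0;
}

int main(void)
{
	unsigned int tail = 100, head = 105;	/* head pushed past tail */
	unsigned int wrapped = tail - head;
	unsigned int old_queued = wrapped < CQ_ENTRIES ? wrapped : CQ_ENTRIES;

	printf("tail - head = %u (wrapped)\n", wrapped);
	printf("old: queued = %u, free = %u\n", old_queued, CQ_ENTRIES - old_queued);
	printf("new: queued = %u, free = %u\n",
	       cqring_queued(tail, head), CQ_ENTRIES - cqring_queued(tail, head));
	return 0;
}

The old computation ends up with free = 0 and every refill fails; the
clamped signed version keeps free = CQ_ENTRIES.
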
 io_uring/io_uring.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2ebb0ba37c4f..036145ee466c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -686,13 +686,27 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 	return ocqe;
 }
 
+/*
+ * Compute queued CQEs for free-space calculation, clamped to cq_entries.
+ */
+static unsigned int io_cqring_queued(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = io_get_rings(ctx);
+	int diff;
+
+	diff = (int)(ctx->cached_cq_tail - READ_ONCE(rings->cq.head));
+	if (diff >= 0)
+		return min((unsigned int)diff, ctx->cq_entries);
+	return 0;
+}
+
 /*
  * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
  * because the ring is a single 16b entry away from wrapping.
  */
 static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
 {
-	if (__io_cqring_events(ctx) < ctx->cq_entries) {
+	if (io_cqring_queued(ctx) < ctx->cq_entries) {
 		struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
 
 		cqe->user_data = 0;
@@ -713,7 +727,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
-	unsigned int free, queued, len;
+	unsigned int free, len;
 
 	/*
 	 * Posting into the CQ when there are pending overflowed CQEs may break
@@ -733,9 +747,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 		off = 0;
 	}
 
-	/* userspace may cheat modifying the tail, be safe and do min */
-	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
-	free = ctx->cq_entries - queued;
+	free = ctx->cq_entries - io_cqring_queued(ctx);
 	/* we need a contiguous range, limit based on the current array offset */
 	len = min(free, ctx->cq_entries - off);
 	if (len < (cqe32 + 1))