From 3799c2570982577551023ae035f5a786cf39a76e Mon Sep 17 00:00:00 2001
From: Maoyi Xie
Date: Sun, 10 May 2026 16:41:19 +0800
Subject: [PATCH 1/7] io_uring/fdinfo: translate SqThread PID through caller's
 pid_ns

SQPOLL stores current->pid (init_pid_ns view) in sqd->task_pid at
thread creation. fdinfo prints it raw via
seq_printf("SqThread:\t%d\n", sq_pid). A reader inside a non-initial
pid_ns sees the host PID, not the kthread's PID in the reader's own
pid_ns.

The SQPOLL kthread is created with CLONE_THREAD and no CLONE_NEW*, so
it lives in the submitter's pid_ns. An unprivileged user_ns + pid_ns
submitter can read fdinfo and learn the host PID of a kthread whose
in-namespace PID is different.

Reproducer (mainline 7.0, KASAN): unshare CLONE_NEWUSER | CLONE_NEWPID
| CLONE_NEWNS, mount a private /proc, then have a grandchild that is
pid 1 in the new pid_ns open an io_uring ring with IORING_SETUP_SQPOLL.
/proc/self/task lists {1, 2}; the SQPOLL kthread is pid 2. Before,
fdinfo reports the host-namespace PID for SqThread; after, it reports
SqThread = 2.

Use task_pid_nr_ns() against the proc inode's pid_ns to compute sq_pid,
instead of reading the stored sq->task_pid (which holds the init_pid_ns
view). pidfd_show_fdinfo() in kernel/pid.c follows the same pattern.

Signed-off-by: Maoyi Xie
Link: https://patch.msgid.link/20260510084119.457578-1-maoyi.xie@ntu.edu.sg
Signed-off-by: Jens Axboe
---
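For illustration, a minimal userspace sketch of the reproducer described
above. It assumes liburing is available; the file name, and running it as
pid 1 of a fresh user/pid/mount namespace with a newly mounted /proc
(e.g. via "unshare -Urpmf --mount-proc"), are assumptions of the sketch,
not part of the change:

/* sqpoll_fdinfo.c: print the SqThread line for an SQPOLL ring */
#include <liburing.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	char path[64], line[256];
	FILE *f;
	int ret;

	/* IORING_SETUP_SQPOLL spawns the kernel submission thread */
	ret = io_uring_queue_init(8, &ring, IORING_SETUP_SQPOLL);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ring.ring_fd);
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "SqThread", 8))
			fputs(line, stdout);
	}
	fclose(f);
	io_uring_queue_exit(&ring);
	return 0;
}

With the translation in place, the printed SqThread value is a PID that
exists in the reader's own /proc.
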
 io_uring/fdinfo.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index c2d3e45544bb..001fb542dc11 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -190,8 +190,9 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 			get_task_struct(tsk);
 			rcu_read_unlock();
 			usec = io_sq_cpu_usec(tsk);
+			sq_pid = task_pid_nr_ns(tsk,
+					proc_pid_ns(file_inode(m->file)->i_sb));
 			put_task_struct(tsk);
-			sq_pid = sq->task_pid;
 			sq_cpu = sq->sq_cpu;
 			sq_total_time = usec;
 			sq_work_time = sq->work_time;

From 20c39819a27646573dfa0ac0d01c38895298a6f6 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 11 May 2026 10:58:38 -0600
Subject: [PATCH 2/7] io_uring: hold uring_lock when walking link chain in
 io_wq_free_work()

io_wq_free_work() calls io_req_find_next() from io-wq worker context,
which reads and clears req->link without holding any lock. This can
potentially race with other paths that mutate the same chain under
ctx->uring_lock.

Take ctx->uring_lock around the io_req_find_next() call. Only requests
with IO_REQ_LINK_FLAGS reach this path, which is not the hot path.

Signed-off-by: Jens Axboe
---
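For illustration only, a userspace analogue of the locking rule this
change applies (plain C with pthreads; the struct and function names are
invented for the example and the mutex merely stands in for
ctx->uring_lock): any path that walks the chain and rewrites ->link takes
the same lock as the paths that splice entries out, so neither can observe
a half-updated chain:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct req {
	struct req *link;	/* next request in the chain */
	int id;
};

static pthread_mutex_t uring_lock = PTHREAD_MUTEX_INITIALIZER;

/* analogue of io_req_find_next(): detach and hand back the follow-up request */
static struct req *req_find_next(struct req *req)
{
	struct req *nxt;

	pthread_mutex_lock(&uring_lock);
	nxt = req->link;
	req->link = NULL;
	pthread_mutex_unlock(&uring_lock);
	return nxt;
}

/* analogue of io_remove_next_linked(): splice out req's successor */
static void remove_next_linked(struct req *req)
{
	pthread_mutex_lock(&uring_lock);
	if (req->link)
		req->link = req->link->link;
	pthread_mutex_unlock(&uring_lock);
}

int main(void)
{
	struct req c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	struct req *nxt;

	remove_next_linked(&a);		/* chain is now a -> c */
	nxt = req_find_next(&a);	/* detach c, chain now empty */
	printf("next after %d: %d\n", a.id, nxt ? nxt->id : -1);
	return 0;
}

Built with -pthread, this prints "next after 1: 3".
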
 io_uring/io_uring.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4ed998d60c09..2ebb0ba37c4f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1452,8 +1452,13 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
 	struct io_kiocb *nxt = NULL;
 
 	if (req_ref_put_and_test_atomic(req)) {
-		if (req->flags & IO_REQ_LINK_FLAGS)
+		if (req->flags & IO_REQ_LINK_FLAGS) {
+			struct io_ring_ctx *ctx = req->ctx;
+
+			mutex_lock(&ctx->uring_lock);
 			nxt = io_req_find_next(req);
+			mutex_unlock(&ctx->uring_lock);
+		}
 		io_free_req(req);
 	}
 	return nxt ? &nxt->work : NULL;

From 49ae66eb8c27375075ffa308cfd4bf25af335d41 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 11 May 2026 10:58:50 -0600
Subject: [PATCH 3/7] io_uring: defer linked-timeout chain splice out of
 hrtimer context

io_link_timeout_fn() is the hrtimer callback that fires when a linked
timeout expires. It currently calls io_remove_next_linked(prev) under
ctx->timeout_lock to splice the timeout request out of the link chain.
This is the only chain-mutation site that runs without ctx->uring_lock,
because hrtimer callbacks cannot take a mutex.

Defer the splicing until the task_work callback.

Signed-off-by: Jens Axboe
---
 io_uring/timeout.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index e2595cae2b07..6353a4d979dc 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -284,6 +284,10 @@ static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
 	struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout);
 
 	io_remove_next_linked(req);
+
+	/* If this is NULL, then timer already claimed it and will complete it */
+	if (!timeout->head)
+		return NULL;
 	timeout->head = NULL;
 	if (hrtimer_try_to_cancel(&io->timer) != -1) {
 		list_del(&timeout->list);
@@ -367,6 +371,14 @@ static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw)
 	int ret;
 
 	if (prev) {
+		/*
+		 * splice the linked timeout out of prev's chain if the regular
+		 * completion path didn't already do it.
+		 */
+		if (prev->link == req)
+			prev->link = req->link;
+		req->link = NULL;
+
 		if (!tw.cancel) {
 			struct io_cancel_data cd = {
 				.ctx		= req->ctx,
@@ -401,10 +413,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 
 	/*
 	 * We don't expect the list to be empty, that will only happen if we
-	 * race with the completion of the linked work.
+	 * race with the completion of the linked work. Splice of prev is
+	 * done in io_req_task_link_timeout(), if needed.
 	 */
 	if (prev) {
-		io_remove_next_linked(prev);
 		if (!req_ref_inc_not_zero(prev))
 			prev = NULL;
 	}

From a65855ec34aed84e1e5b4aea0323cc1745f83a5c Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 11 May 2026 10:58:56 -0600
Subject: [PATCH 4/7] io_uring: hold uring_lock across io_kill_timeouts() in
 cancel path

io_uring_try_cancel_requests() dropped ctx->uring_lock before calling
io_kill_timeouts(), which walks each timeout's link chain via
io_match_task() to test REQ_F_INFLIGHT. With chain mutation now
serialized by ctx->uring_lock, that walk needs the lock too.

Signed-off-by: Jens Axboe
---
 io_uring/cancel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 5e5eb9cfc7cd..4aa3103ba9c3 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -561,8 +561,8 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 	ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
 	ret |= io_futex_remove_all(ctx, tctx, cancel_all);
 	ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all);
-	mutex_unlock(&ctx->uring_lock);
 	ret |= io_kill_timeouts(ctx, tctx, cancel_all);
+	mutex_unlock(&ctx->uring_lock);
 	if (tctx)
 		ret |= io_run_task_work() > 0;
 	else

From 5f7c7c63ffb1a187eb90c80864469db45f3bd2a8 Mon Sep 17 00:00:00 2001
From: Yang Xiuwei
Date: Wed, 13 May 2026 17:43:03 +0800
Subject: [PATCH 5/7] io_uring/rw: drop unused attr_type_mask from
 io_prep_rw_pi()

io_prep_rw_pi() never used the attr_type_mask argument. Callers already
validate sqe->attr_type_mask before invoking the helper (only
IORING_RW_ATTR_FLAG_PI is supported today). Remove the dead parameter
to avoid implying further interpretation happens here.

Signed-off-by: Yang Xiuwei
Link: https://patch.msgid.link/20260513094303.866533-1-yangxiuwei@kylinos.cn
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/rw.c b/io_uring/rw.c
index e729e0e7657e..0c4834645279 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -230,7 +230,7 @@ static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
 }
 
 static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
-			 u64 attr_ptr, u64 attr_type_mask)
+			 u64 attr_ptr)
 {
 	struct io_uring_attr_pi pi_attr;
 	struct io_async_rw *io;
@@ -305,7 +305,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			return -EINVAL;
 
 		attr_ptr = READ_ONCE(sqe->attr_ptr);
-		return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
+		return io_prep_rw_pi(req, rw, ddir, attr_ptr);
 	}
 	return 0;
 }

From d6a2d7b04b5a093021a7a0e2e69e9d5237dfa8cc Mon Sep 17 00:00:00 2001
From: Nicholas Carlini
Date: Mon, 11 May 2026 18:02:16 +0000
Subject: [PATCH 6/7] io-wq: check that the predecessor is hashed in
 io_wq_remove_pending()

io_wq_remove_pending() needs to fix up wq->hash_tail[] if the cancelled
work was the tail of its hash bucket. When doing this, it checks
whether the preceding entry in acct->work_list has the same hash value,
but never checks that the predecessor is hashed at all.

io_get_work_hash() is simply atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT,
and the hash bits are never set for non-hashed work, so it returns 0.
Thus, when a hashed bucket-0 work is cancelled while a non-hashed work
is its list predecessor, the check spuriously passes and a pointer to
the non-hashed io_kiocb is stored in wq->hash_tail[0].

Because non-hashed work is dequeued via the fast path in
io_get_next_work(), which never touches hash_tail[], the stale pointer
is never cleared. Therefore, after the non-hashed io_kiocb completes
and is freed back to req_cachep, wq->hash_tail[0] is a dangling
pointer. The io_wq is per-task (tctx->io_wq) and survives ring
open/close, so the dangling pointer persists for the lifetime of the
task; the next hashed bucket-0 enqueue dereferences it in
io_wq_insert_work() and wq_list_add_after() writes through freed
memory.

Add the missing io_wq_is_hashed() check so a non-hashed predecessor
never inherits a hash_tail[] slot.

Cc: stable@vger.kernel.org
Fixes: 204361a77f40 ("io-wq: fix hang after cancelling pending hashed work")
Signed-off-by: Nicholas Carlini
Signed-off-by: Jens Axboe
---
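For illustration, a small standalone program showing why the old check
could not distinguish "not hashed" from "hashed into bucket 0" (plain
userspace C; the two constants mirror the layout described above but are
assumptions for the example, not values taken from io-wq.h):

#include <stdio.h>

/* assumed layout: hash key in the top bits, hashed marker in the low bits */
#define IO_WQ_HASH_SHIFT	24
#define IO_WQ_WORK_HASHED	(1U << 1)

static unsigned int get_work_hash(unsigned int flags)
{
	return flags >> IO_WQ_HASH_SHIFT;
}

static int is_hashed(unsigned int flags)
{
	return !!(flags & IO_WQ_WORK_HASHED);
}

int main(void)
{
	unsigned int hashed_bucket0 = IO_WQ_WORK_HASHED | (0U << IO_WQ_HASH_SHIFT);
	unsigned int not_hashed = 0;

	/* both report hash 0: the hash value alone cannot tell them apart */
	printf("hashed work:  bucket %u, hashed=%d\n",
	       get_work_hash(hashed_bucket0), is_hashed(hashed_bucket0));
	printf("plain work:   bucket %u, hashed=%d\n",
	       get_work_hash(not_hashed), is_hashed(not_hashed));
	return 0;
}

Both works report hash 0; only the explicit hashed test separates them,
which is exactly the check this patch adds.
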
 io_uring/io-wq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 7a9f94a0ce6f..8cc7b47d3089 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1124,7 +1124,8 @@ static inline void io_wq_remove_pending(struct io_wq *wq,
 	if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) {
 		if (prev)
 			prev_work = container_of(prev, struct io_wq_work, list);
-		if (prev_work && io_get_work_hash(prev_work) == hash)
+		if (prev_work && io_wq_is_hashed(prev_work) &&
+		    io_get_work_hash(prev_work) == hash)
 			wq->hash_tail[hash] = prev_work;
 		else
 			wq->hash_tail[hash] = NULL;

From f44d38a31f1802b7222adaea9ee69f9d280f698a Mon Sep 17 00:00:00 2001
From: Zizhi Wo
Date: Thu, 14 May 2026 10:18:47 +0800
Subject: [PATCH 7/7] io_uring: validate user-controlled cq.head in
 io_cqe_cache_refill()

A fuzzing run reproduced an unkillable io_uring task stuck at ~100% CPU:

  [root@fedora io_uring_stress]# ps -ef | grep io_uring
  root      1240     1 99 13:36 ?  00:01:35 [io_uring_stress]

The task loops inside io_cqring_wait() and never returns to userspace,
and SIGKILL has no effect.

This is caused by the CQ ring exposing rings->cq.head to userspace as
writable, while the authoritative tail lives in kernel-private
ctx->cached_cq_tail. io_cqe_cache_refill() computes free space as an
unsigned subtraction:

  free = ctx->cq_entries - min(tail - head, ctx->cq_entries);

If userspace keeps head within [0, tail], the subtraction is well
defined and min() just acts as a defensive clamp. But if userspace
advances head past tail, (tail - head) wraps to a huge value, free
becomes 0, and io_cqe_cache_refill() fails. The CQE is pushed onto the
overflow list and IO_CHECK_CQ_OVERFLOW_BIT is set.

The wait loop in io_cqring_wait() relies on an invariant: refill() only
fails when the CQ is *physically* full, in which case rings->cq.tail
has been advanced to iowq->cq_tail and io_should_wake() returns true.
The tampered head breaks this: refill() fails while the ring is not
full, no OCQE is copied in, rings->cq.tail never catches up,
io_should_wake() stays false, and io_cqring_wait_schedule() keeps
returning early because IO_CHECK_CQ_OVERFLOW_BIT is still set. The
result is a tight retry loop that never returns to userspace.

Introduce io_cqring_queued() as the single point that converts the
(tail, head) pair into a trustworthy queued count. Since the real
head/tail distance is bounded by cq_entries (far below 2^31), a signed
comparison reliably detects userspace moving head past tail; in that
case treat the queue as empty so callers see the full cache as free and
forward progress is preserved.

Suggested-by: Jens Axboe
Signed-off-by: Zizhi Wo
Link: https://patch.msgid.link/20260514021847.4062782-1-wozizhi@huaweicloud.com
[axboe: fixup commit message, kill 'queued' var, and keep it all in io_uring.c]
Signed-off-by: Jens Axboe
---
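For illustration, the arithmetic at the heart of the change as a
standalone userspace program (plain C, not kernel code; CQ_ENTRIES and
the helper below merely mirror io_cqring_queued()): once head is pushed
past tail, the unsigned difference wraps and the old clamped calculation
reports a full ring, while the signed-difference check treats the ring as
empty and leaves all of it free:

#include <stdio.h>

#define CQ_ENTRIES	128U

/* mirrors io_cqring_queued(): clamp, and treat head > tail as empty */
static unsigned int cqring_queued(unsigned int tail, unsigned int head)
{
	int diff = (int)(tail - head);

	if (diff >= 0)
		return (unsigned int)diff < CQ_ENTRIES ? (unsigned int)diff : CQ_ENTRIES;
	return 0;
}

int main(void)
{
	unsigned int tail = 100, head = 105;	/* head pushed past tail */
	unsigned int wrapped = tail - head;
	unsigned int old_queued = wrapped < CQ_ENTRIES ? wrapped : CQ_ENTRIES;

	printf("tail - head = %u (wrapped)\n", wrapped);
	printf("old: queued = %u, free = %u\n", old_queued, CQ_ENTRIES - old_queued);
	printf("new: queued = %u, free = %u\n",
	       cqring_queued(tail, head), CQ_ENTRIES - cqring_queued(tail, head));
	return 0;
}

The old computation ends up with free = 0 and every refill fails; the
clamped signed version keeps free = CQ_ENTRIES.
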
 io_uring/io_uring.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2ebb0ba37c4f..036145ee466c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -686,13 +686,27 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 	return ocqe;
 }
 
+/*
+ * Compute queued CQEs for free-space calculation, clamped to cq_entries.
+ */
+static unsigned int io_cqring_queued(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = io_get_rings(ctx);
+	int diff;
+
+	diff = (int)(ctx->cached_cq_tail - READ_ONCE(rings->cq.head));
+	if (diff >= 0)
+		return min((unsigned int)diff, ctx->cq_entries);
+	return 0;
+}
+
 /*
  * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
  * because the ring is a single 16b entry away from wrapping.
  */
 static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
 {
-	if (__io_cqring_events(ctx) < ctx->cq_entries) {
+	if (io_cqring_queued(ctx) < ctx->cq_entries) {
 		struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
 
 		cqe->user_data = 0;
@@ -713,7 +727,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
-	unsigned int free, queued, len;
+	unsigned int free, len;
 
 	/*
 	 * Posting into the CQ when there are pending overflowed CQEs may break
@@ -733,9 +747,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 		off = 0;
 	}
 
-	/* userspace may cheat modifying the tail, be safe and do min */
-	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
-	free = ctx->cq_entries - queued;
+	free = ctx->cq_entries - io_cqring_queued(ctx);
 	/* we need a contiguous range, limit based on the current array offset */
 	len = min(free, ctx->cq_entries - off);
 	if (len < (cqe32 + 1))