From 326941b22806cbf2df1fbfe902b7908b368cce42 Mon Sep 17 00:00:00 2001 From: Longxuan Yu Date: Sun, 12 Apr 2026 16:38:20 +0800 Subject: [PATCH 01/15] io_uring/poll: fix signed comparison in io_poll_get_ownership() io_poll_get_ownership() uses a signed comparison to check whether poll_refs has reached the threshold for the slowpath: if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) atomic_read() returns int (signed). When IO_POLL_CANCEL_FLAG (BIT(31)) is set in poll_refs, the value becomes negative in signed arithmetic, so the >= 128 comparison always evaluates to false and the slowpath is never taken. Fix this by casting the atomic_read() result to unsigned int before the comparison, so that the cancel flag is treated as a large positive value and correctly triggers the slowpath. Fixes: a26a35e9019f ("io_uring: make poll refs more robust") Cc: stable@vger.kernel.org Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Zhengchuan Liang Signed-off-by: Longxuan Yu Signed-off-by: Ren Wei Reviewed-by: Pavel Begunkov Link: https://patch.msgid.link/3a3508b08bcd7f1bc3beff848ae6e1d73d355043.1775965597.git.ylong030@ucr.edu Signed-off-by: Jens Axboe --- io_uring/poll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 74eef7884159..6834e2db937e 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -93,7 +93,7 @@ static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) */ static inline bool io_poll_get_ownership(struct io_kiocb *req) { - if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) + if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) return io_poll_get_ownership_slowpath(req); return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } From ee5417fd02cabb6235a89daf5142ffde9aa957fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Apr 2026 14:22:16 -0600 Subject: [PATCH 02/15] io_uring/tctx: check for setup tctx->io_wq before teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As with the idling code before it, the error exit path should check for a NULL tctx->io_wq before calling io_wq_put_and_exit(). Fixes: 7880174e1e5e ("io_uring/tctx: clean up __io_uring_add_tctx_node() error handling") Reported-by: Dan Carpenter Reviewed-by: Clément Léger Signed-off-by: Jens Axboe --- io_uring/tctx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 61533f30494f..c011a593c0ad 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -171,7 +171,8 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) } if (!current->io_uring) { err_free: - io_wq_put_and_exit(tctx->io_wq); + if (tctx->io_wq) + io_wq_put_and_exit(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); kfree(tctx); } From 41859843f27dd5c8d3bc43489ad9196c96d39f2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 16 Apr 2026 10:05:41 -0600 Subject: [PATCH 03/15] io_uring/tctx: mark io_wq as exiting before error path teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot reports that it's hitting the below condition for exiting an io_wq context: WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state)) in io_wq_put_and_exit(), which can be triggered with memory allocation fault injection. Ensure that the io_wq is marked as exiting to silence this warning trigger. 
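Together with the NULL check added in the previous patch, the error exit in __io_uring_add_tctx_node() ends up looking roughly like the following (a sketch of the combined result; the authoritative change is the diff below):

    if (!current->io_uring) {
    err_free:
            if (tctx->io_wq) {
                    /* set IO_WQ_BIT_EXIT before dropping the reference, so the
                     * exit-state check in io_wq_put_and_exit() is satisfied
                     */
                    io_wq_exit_start(tctx->io_wq);
                    io_wq_put_and_exit(tctx->io_wq);
            }
            percpu_counter_destroy(&tctx->inflight);
            kfree(tctx);
    }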
Reported-by: syzbot+79a4cc863a8db58cd92b@syzkaller.appspotmail.com Fixes: 7880174e1e5e ("io_uring/tctx: clean up __io_uring_add_tctx_node() error handling") Reviewed-by: Clément Léger Signed-off-by: Jens Axboe --- io_uring/tctx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index c011a593c0ad..80366320276d 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -171,8 +171,10 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) } if (!current->io_uring) { err_free: - if (tctx->io_wq) + if (tctx->io_wq) { + io_wq_exit_start(tctx->io_wq); io_wq_put_and_exit(tctx->io_wq); + } percpu_counter_destroy(&tctx->inflight); kfree(tctx); } From 42a702aaedf54aa8056fc429fc757a600182e5f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 14:04:00 +0000 Subject: [PATCH 04/15] io_uring: fix iowq_limits data race in tctx node addition __io_uring_add_tctx_node() reads ctx->int_flags and ctx->iowq_limits[0..1] without holding ctx->uring_lock, while io_register_iowq_max_workers() writes these same fields under the lock. Mostly an application problem if you try and make these race, but let's silence KCSAN by just grabbing the ->uring_lock around the operation. This is a slow path operation anyway, and ->uring_lock will be grabbed by submission right after. Fixes: 2e480058ddc2 ("io-wq: provide a way to limit max number of workers") Signed-off-by: Jens Axboe --- io_uring/tctx.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 80366320276d..6af62ca9baba 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -146,9 +146,13 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) if (IS_ERR(tctx)) return PTR_ERR(tctx); - if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) { - unsigned int limits[2] = { ctx->iowq_limits[0], - ctx->iowq_limits[1], }; + if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) { + unsigned int limits[2]; + + mutex_lock(&ctx->uring_lock); + limits[0] = ctx->iowq_limits[0]; + limits[1] = ctx->iowq_limits[1]; + mutex_unlock(&ctx->uring_lock); ret = io_wq_max_workers(tctx->io_wq, limits); if (ret) From 8e1f412b5bc690cb72b3303a1ae0d42955e5e2b3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 14:06:00 +0000 Subject: [PATCH 05/15] io_uring: fix spurious fput in registered ring path Fix an issue with io_uring_ctx_get_file() not gating fput() on whether or not the file descriptor is a registered/direct one. Fixes: c5e9f6a96bf7 ("io_uring: unify getting ctx from passed in file descriptor") Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index dd6326dc5f88..4ed998d60c09 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2575,7 +2575,8 @@ struct file *io_uring_ctx_get_file(unsigned int fd, bool registered) return ERR_PTR(-EBADF); if (io_is_uring_fops(file)) return file; - fput(file); + if (!registered) + fput(file); return ERR_PTR(-EOPNOTSUPP); } From 53262c91f7b81f96495ff24e9d1fa8b1632e69c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 13:14:54 -0600 Subject: [PATCH 06/15] io_uring/rsrc: unify nospec indexing for direct descriptors For file updates, the node reset isn't capping the value via array_index_nospec() like the other paths do. Ensure it's all sane and have the update path do the proper capping as well. 
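The capping follows the usual two-step pattern: bounds check first, then clamp the index with array_index_nospec() before it is used to index the table, so a mispredicted bounds check cannot be used to speculatively load out of range. Roughly (a sketch mirroring the io_reset_rsrc_node() hunk below):

    if (index >= data->nr)
            return false;
    /* clamp index under speculation before touching data->nodes[] */
    index = array_index_nospec(index, data->nr);
    node = data->nodes[index];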
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 3 +++ io_uring/rsrc.h | 9 +++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index fd36e0e319a2..c042054c3b5f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -238,6 +238,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, continue; i = up->offset + done; + if (i >= ctx->file_table.data.nr) + break; + i = array_index_nospec(i, ctx->file_table.data.nr); if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index cff0f8834c35..44e3386f7c1c 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -109,10 +109,15 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node } static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, - struct io_rsrc_data *data, int index) + struct io_rsrc_data *data, + unsigned int index) { - struct io_rsrc_node *node = data->nodes[index]; + struct io_rsrc_node *node; + if (index >= data->nr) + return false; + index = array_index_nospec(index, data->nr); + node = data->nodes[index]; if (!node) return false; io_put_rsrc_node(ctx, node); From 02b8d41c17630493f63c7785c873e327fa9b76a6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 13:15:41 -0600 Subject: [PATCH 07/15] io_uring/rsrc: use kvfree() for the imu cache Currently anything that requires kvmalloc_flex() for allocations will not get re-cached, and hence the cache freeing path is correct in that it always uses kfree() to free the allocated memory. But this seems a bit fragile as it's something that could get missed should that situation change, so switch io_free_imu() and io_alloc_cache_free() to use kvfree() as the destructor. Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 2 +- io_uring/rsrc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 45fcd8b3b824..962b6e2d04cc 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -64,7 +64,7 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) { if (!io_alloc_cache_put(cache, obj)) - kfree(obj); + kvfree(obj); } #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index c042054c3b5f..650303626be6 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -168,7 +168,7 @@ bool io_rsrc_cache_init(struct io_ring_ctx *ctx) void io_rsrc_cache_free(struct io_ring_ctx *ctx) { io_alloc_cache_free(&ctx->node_cache, kfree); - io_alloc_cache_free(&ctx->imu_cache, kfree); + io_alloc_cache_free(&ctx->imu_cache, kvfree); } static void io_clear_table_tags(struct io_rsrc_data *data) From 79968834558774bdc5de4b5503d412df632646aa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 13:16:19 -0600 Subject: [PATCH 08/15] io_uring/rw: add defensive hardening for negative kbuf lengths No real bug here, just being a bit defensive in ensuring that whatever gets passed into io_put_kbuf() is always >= 0 and not some random error value. 
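As a standalone illustration of what the clamping guards against should a negative error value ever end up in cqe->res on these paths (plain userspace C; -ECANCELED is just an arbitrary example of such a value):

    #include <stdio.h>

    int main(void)
    {
            int res = -125;                  /* e.g. -ECANCELED stored in cqe->res */
            unsigned int as_len = res;       /* naive use as a length: 4294967171 */
            int clamped = res > 0 ? res : 0; /* what max(req->cqe.res, 0) passes on: 0 */

            printf("as_len=%u clamped=%d\n", as_len, clamped);
            return 0;
    }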
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/rw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 20654deff84d..e729e0e7657e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -580,7 +580,7 @@ void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw) io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) - req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); + req->cqe.flags |= io_put_kbuf(req, max(req->cqe.res, 0), NULL); io_req_rw_cleanup(req, 0); io_req_task_complete(tw_req, tw); @@ -1379,7 +1379,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) list_del(&req->iopoll_node); wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs); nr_events++; - req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); + req->cqe.flags = io_put_kbuf(req, max(req->cqe.res, 0), NULL); if (!io_is_uring_cmd(req)) io_req_rw_cleanup(req, 0); } From 7faaa6812aba550c24bffdfd9399568223c8a477 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 14:24:50 -0600 Subject: [PATCH 09/15] io_uring/futex: ensure partial wakes are appropriately dequeued If a FUTEX_WAITV vectored operation is only partially woken, we should call __futex_wake_mark() on the queue to account for that. If not, then a later wakeup will wake the same entry, rather than the next one in line. Fixes: 8f350194d5cfd ("io_uring: add support for vectored futex waits") Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/futex.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index fd503c24b428..9cc1788ef4c6 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -159,8 +159,10 @@ static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) struct io_kiocb *req = q->wake_data; struct io_futexv_data *ifd = req->async_data; - if (!io_futexv_claim(ifd)) + if (!io_futexv_claim(ifd)) { + __futex_wake_mark(q); return; + } if (unlikely(!__futex_wake_mark(q))) return; From 45cd95763e198d74d369ede43aef0b1955b8dea4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 13:41:38 -0600 Subject: [PATCH 10/15] io_uring/register: fix ring resizing with mixed/large SQEs/CQEs The ring resizing only properly handles "normal" sized SQEs or CQEs, if there are pending entries around a resize. This normally should not be the case, but the code is supposed to handle this regardless. For the mixed SQE/CQE cases, the current copying works fine as they are indexed in the same way. Each half is just copied separately. But for fixed large SQEs and CQEs, the iteration and copy need to take that into account. 
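For example, with IORING_SETUP_SQE128 each logical SQE spans two 64-byte slots, so the slot index and the wrap masks are doubled and 128 bytes are copied per entry. A quick userspace model of that index math (the entry counts here are made up purely for illustration):

    #include <stdio.h>

    int main(void)
    {
            unsigned old_entries = 8, new_entries = 16; /* hypothetical old/new ring sizes */
            unsigned i = 5;                             /* logical SQE index being copied */
            int sqe128 = 1;
            unsigned index = i, size = 64;              /* 64 bytes == normal SQE size */
            unsigned src_mask = old_entries - 1, dst_mask = new_entries - 1;

            if (sqe128) {
                    index <<= 1;                        /* two slots per big SQE */
                    size <<= 1;                         /* copy 128 bytes */
                    src_mask = (old_entries << 1) - 1;
                    dst_mask = (new_entries << 1) - 1;
            }
            printf("copy %u bytes: slot %u -> slot %u\n",
                   size, index & src_mask, index & dst_mask);
            return 0;
    }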
Cc: stable@kernel.org Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/register.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index 24e593332d1a..dce5e2f9cf77 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -599,10 +599,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (tail - old_head > p->sq_entries) goto overflow; for (i = old_head; i < tail; i++) { - unsigned src_head = i & (ctx->sq_entries - 1); - unsigned dst_head = i & (p->sq_entries - 1); + unsigned index, dst_mask, src_mask; + size_t sq_size; - n.sq_sqes[dst_head] = o.sq_sqes[src_head]; + index = i; + sq_size = sizeof(struct io_uring_sqe); + src_mask = ctx->sq_entries - 1; + dst_mask = p->sq_entries - 1; + if (ctx->flags & IORING_SETUP_SQE128) { + index <<= 1; + sq_size <<= 1; + src_mask = (ctx->sq_entries << 1) - 1; + dst_mask = (p->sq_entries << 1) - 1; + } + memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size); } WRITE_ONCE(n.rings->sq.head, old_head); WRITE_ONCE(n.rings->sq.tail, tail); @@ -619,10 +629,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) goto out; } for (i = old_head; i < tail; i++) { - unsigned src_head = i & (ctx->cq_entries - 1); - unsigned dst_head = i & (p->cq_entries - 1); + unsigned index, dst_mask, src_mask; + size_t cq_size; - n.rings->cqes[dst_head] = o.rings->cqes[src_head]; + index = i; + cq_size = sizeof(struct io_uring_cqe); + src_mask = ctx->cq_entries - 1; + dst_mask = p->cq_entries - 1; + if (ctx->flags & IORING_SETUP_CQE32) { + index <<= 1; + cq_size <<= 1; + src_mask = (ctx->cq_entries << 1) - 1; + dst_mask = (p->cq_entries << 1) - 1; + } + memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size); } WRITE_ONCE(n.rings->cq.head, old_head); WRITE_ONCE(n.rings->cq.tail, tail); From 0fcccfd87152f957fa8312b841f6efef42a05a20 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Apr 2026 09:47:04 +0100 Subject: [PATCH 11/15] io_uring/zcrx: fix user_struct uaf io_free_rbuf_ring() uses a struct user_struct, but io_zcrx_ifq_free() puts it down before destroying the ring. Cc: stable@vger.kernel.org Fixes: 5c686456a4e83 ("io_uring/zcrx: add user_struct and mm_struct to io_zcrx_ifq") Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/e560ae00960d27a810522a7efc0e201c82dff351.1776760917.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 9a83d7eb4210..fab3693ecb0d 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -579,13 +579,13 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) if (ifq->area) io_zcrx_free_area(ifq, ifq->area); - free_uid(ifq->user); if (ifq->mm_account) mmdrop(ifq->mm_account); if (ifq->dev) put_device(ifq->dev); io_free_rbuf_ring(ifq); + free_uid(ifq->user); mutex_destroy(&ifq->pp_lock); kfree(ifq); } From 4f02cc4071a18c78bfff571d796edef055d57daa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Apr 2026 09:46:44 +0100 Subject: [PATCH 12/15] io_uring/zcrx: clear RQ headers on init It might be unexpected to users if the RQ head/tail after a ring creation are not zeroed, so fix that. 
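The RQ head/tail words are part of the shared producer/consumer contract, so stale memory there reads as bogus queue state as soon as the ring is mapped. A minimal model of the failure mode (generic unsigned ring arithmetic, not the actual zcrx structures):

    #include <stdio.h>

    int main(void)
    {
            /* freshly mapped ring header that was never cleared */
            unsigned int head = 0xdeadbeef, tail = 0;

            /* a consumer derives the number of posted entries from tail - head */
            printf("entries seemingly available: %u\n", tail - head);

            head = tail = 0; /* what zeroing the header at init guarantees */
            printf("entries after init: %u\n", tail - head);
            return 0;
    }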
Cc: stable@vger.kernel.org Fixes: 6f377873cb239 ("io_uring/zcrx: add interface queue and refill queue") Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/331f94663c3e8f021ffa3cb770ca2844a07d4855.1776760911.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index fab3693ecb0d..2eb09219f0a0 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -396,6 +396,7 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, ifq->rq.ring = (struct io_uring *)ptr; ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring)); return 0; } From 770594e78c3964cf23cf5287f849437cdde9b7d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Apr 2026 09:45:29 +0100 Subject: [PATCH 13/15] io_uring/zcrx: warn on freelist violations The freelist is appropriately sized to always be able to take a free niov, but let's be more defensive and check the invariant with a warning. That should help to catch any double-free issues. Suggested-by: Kai Aizen Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/2f3cea363b04649755e3b6bb9ab66485a95936d5.1776760901.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 2eb09219f0a0..7b93c87b8371 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -602,6 +602,8 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov) struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); guard(spinlock_bh)(&area->freelist_lock); + if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs)) + return; area->freelist[area->free_count++] = net_iov_idx(niov); } From 1967f0b1cafdde37aa9e08e6021c14bcc484b7a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 21 Apr 2026 13:24:33 -0600 Subject: [PATCH 14/15] io_uring/poll: ensure EPOLL_ONESHOT is propagated for EPOLL_URING_WAKE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit: aacf2f9f382c ("io_uring: fix req->apoll_events") fixed an issue where poll->events and req->apoll_events weren't synchronized, but then when the commit referenced in Fixes got added, it didn't ensure the same thing. If we mask in EPOLLONESHOT in the regular EPOLL_URING_WAKE path, then ensure it's done for both. Including a link to the original report below, even though it's mostly nonsense. But it includes a reproducer that does show that IORING_CQE_F_MORE is set in the previous CQE, while no more CQEs will be generated for this request. Just ignore anything that pretends this is security related in any way, it's just the typical AI nonsense. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/io-uring/CAM0zi7yQzF3eKncgHo4iVM5yFLAjsiob_ucqyWKs=hyd_GqiMg@mail.gmail.com/ Reported-by: Azizcan Daştan Fixes: 4464853277d0 ("io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and wakeups") Signed-off-by: Jens Axboe --- io_uring/poll.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 6834e2db937e..0204affdc308 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -417,8 +417,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * disable multishot as there is a circular dependency between * CQ posting and triggering the event. 
*/ - if (mask & EPOLL_URING_WAKE) + if (mask & EPOLL_URING_WAKE) { poll->events |= EPOLLONESHOT; + req->apoll_events |= EPOLLONESHOT; + } /* optional, saves extra locking for removal in tw handler */ if (mask && poll->events & EPOLLONESHOT) { From d0be8884f56b0b800cd8966e37ce23417cd5044e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 21 Apr 2026 15:46:16 +0200 Subject: [PATCH 15/15] io_uring: take page references for NOMMU pbuf_ring mmaps Under !CONFIG_MMU, io_uring_get_unmapped_area() returns the kernel virtual address of the io_mapped_region's backing pages directly; the user's VMA aliases the kernel allocation. io_uring_mmap() then just returns 0 -- it takes no page references. The CONFIG_MMU path uses vm_insert_pages(), which takes a reference on each inserted page. Those references are released when the VMA is torn down (zap_pte_range -> put_page). io_free_region() -> release_pages() drops the io_uring-side references, but the pages survive until munmap drops the VMA-side references. Under NOMMU there are no VMA-side references. io_unregister_pbuf_ring -> io_put_bl -> io_free_region -> release_pages drops the only references and the pages return to the buddy allocator while the user's VMA still has vm_start pointing into them. The user can then write into whatever the allocator hands out next. Mirror the MMU lifetime: take get_page references in io_uring_mmap() and release them via vm_ops->close. NOMMU's delete_vma() calls vma_close() which runs ->close on munmap. This also incidentally addresses the duplicate-vm_start case: two mmaps of SQ_RING and CQ_RING resolve to the same ctx->ring_region pointer. With page refs taken per mmap, the second mmap takes its own refs and the pages survive until both mmaps are closed. The nommu rb-tree BUG_ON on duplicate vm_start is a separate mm/nommu.c concern (it should share the existing region rather than BUG), but the page lifetime is now correct. Cc: Jens Axboe Reported-by: Anthropic Assisted-by: gkh_clanker_t1000 Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026042115-body-attention-d15b@gregkh [axboe: get rid of region lookup, just iterate pages in vma] Signed-off-by: Jens Axboe --- io_uring/memmap.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index e6958968975a..4f9b439319c4 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -366,9 +366,53 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, #else /* !CONFIG_MMU */ +/* + * Drop the pages that were initially referenced and added in + * io_uring_mmap(). We cannot have had a mremap() as that isn't supported, + * hence the vma should be identical to the one we initially referenced and + * mapped, and partial unmaps and splitting isn't possible on a file backed + * mapping. + */ +static void io_uring_nommu_vm_close(struct vm_area_struct *vma) +{ + unsigned long index; + + for (index = vma->vm_start; index < vma->vm_end; index += PAGE_SIZE) + put_page(virt_to_page((void *) index)); +} + +static const struct vm_operations_struct io_uring_nommu_vm_ops = { + .close = io_uring_nommu_vm_close, +}; + int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { - return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -EINVAL; + struct io_ring_ctx *ctx = file->private_data; + struct io_mapped_region *region; + unsigned long i; + + if (!is_nommu_shared_mapping(vma->vm_flags)) + return -EINVAL; + + guard(mutex)(&ctx->mmap_lock); + region = io_mmap_get_region(ctx, vma->vm_pgoff); + if (!region || !io_region_is_set(region)) + return -EINVAL; + + if ((vma->vm_end - vma->vm_start) != + (unsigned long) region->nr_pages << PAGE_SHIFT) + return -EINVAL; + + /* + * Pin the pages so io_free_region()'s release_pages() does not + * drop the last reference while this VMA exists. delete_vma() + * in mm/nommu.c calls vma_close() which runs ->close above. + */ + for (i = 0; i < region->nr_pages; i++) + get_page(region->pages[i]); + + vma->vm_ops = &io_uring_nommu_vm_ops; + return 0; } unsigned int io_uring_nommu_mmap_capabilities(struct file *file)