From 326941b22806cbf2df1fbfe902b7908b368cce42 Mon Sep 17 00:00:00 2001 From: Longxuan Yu Date: Sun, 12 Apr 2026 16:38:20 +0800 Subject: [PATCH 01/15] io_uring/poll: fix signed comparison in io_poll_get_ownership() io_poll_get_ownership() uses a signed comparison to check whether poll_refs has reached the threshold for the slowpath: if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) atomic_read() returns int (signed). When IO_POLL_CANCEL_FLAG (BIT(31)) is set in poll_refs, the value becomes negative in signed arithmetic, so the >= 128 comparison always evaluates to false and the slowpath is never taken. Fix this by casting the atomic_read() result to unsigned int before the comparison, so that the cancel flag is treated as a large positive value and correctly triggers the slowpath. Fixes: a26a35e9019f ("io_uring: make poll refs more robust") Cc: stable@vger.kernel.org Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Zhengchuan Liang Signed-off-by: Longxuan Yu Signed-off-by: Ren Wei Reviewed-by: Pavel Begunkov Link: https://patch.msgid.link/3a3508b08bcd7f1bc3beff848ae6e1d73d355043.1775965597.git.ylong030@ucr.edu Signed-off-by: Jens Axboe --- io_uring/poll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 74eef7884159..6834e2db937e 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -93,7 +93,7 @@ static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) */ static inline bool io_poll_get_ownership(struct io_kiocb *req) { - if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) + if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) return io_poll_get_ownership_slowpath(req); return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } From ee5417fd02cabb6235a89daf5142ffde9aa957fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Apr 2026 14:22:16 -0600 Subject: [PATCH 02/15] io_uring/tctx: check for setup tctx->io_wq before teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As with the idling code before it, the error exit path should check for a NULL tctx->io_wq before calling io_wq_put_and_exit(). Fixes: 7880174e1e5e ("io_uring/tctx: clean up __io_uring_add_tctx_node() error handling") Reported-by: Dan Carpenter Reviewed-by: Clément Léger Signed-off-by: Jens Axboe --- io_uring/tctx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 61533f30494f..c011a593c0ad 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -171,7 +171,8 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) } if (!current->io_uring) { err_free: - io_wq_put_and_exit(tctx->io_wq); + if (tctx->io_wq) + io_wq_put_and_exit(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); kfree(tctx); } From 41859843f27dd5c8d3bc43489ad9196c96d39f2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 16 Apr 2026 10:05:41 -0600 Subject: [PATCH 03/15] io_uring/tctx: mark io_wq as exiting before error path teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot reports that it's hitting the below condition for exiting an io_wq context: WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state)) in io_wq_put_and_exit(), which can be triggered with memory allocation fault injection. Ensure that the io_wq is marked as exiting to silence this warning trigger. 
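Together with the NULL check added in the previous patch, the error exit in __io_uring_add_tctx_node() ends up looking roughly like the following (a sketch of the combined result; the authoritative change is the diff below):

    if (!current->io_uring) {
    err_free:
            if (tctx->io_wq) {
                    /* set IO_WQ_BIT_EXIT before dropping the reference, so the
                     * exit-state check in io_wq_put_and_exit() is satisfied
                     */
                    io_wq_exit_start(tctx->io_wq);
                    io_wq_put_and_exit(tctx->io_wq);
            }
            percpu_counter_destroy(&tctx->inflight);
            kfree(tctx);
    }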
Reported-by: syzbot+79a4cc863a8db58cd92b@syzkaller.appspotmail.com Fixes: 7880174e1e5e ("io_uring/tctx: clean up __io_uring_add_tctx_node() error handling") Reviewed-by: Clément Léger Signed-off-by: Jens Axboe --- io_uring/tctx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index c011a593c0ad..80366320276d 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -171,8 +171,10 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) } if (!current->io_uring) { err_free: - if (tctx->io_wq) + if (tctx->io_wq) { + io_wq_exit_start(tctx->io_wq); io_wq_put_and_exit(tctx->io_wq); + } percpu_counter_destroy(&tctx->inflight); kfree(tctx); } From 42a702aaedf54aa8056fc429fc757a600182e5f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 14:04:00 +0000 Subject: [PATCH 04/15] io_uring: fix iowq_limits data race in tctx node addition __io_uring_add_tctx_node() reads ctx->int_flags and ctx->iowq_limits[0..1] without holding ctx->uring_lock, while io_register_iowq_max_workers() writes these same fields under the lock. Mostly an application problem if you try and make these race, but let's silence KCSAN by just grabbing the ->uring_lock around the operation. This is a slow path operation anyway, and ->uring_lock will be grabbed by submission right after. Fixes: 2e480058ddc2 ("io-wq: provide a way to limit max number of workers") Signed-off-by: Jens Axboe --- io_uring/tctx.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 80366320276d..6af62ca9baba 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -146,9 +146,13 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) if (IS_ERR(tctx)) return PTR_ERR(tctx); - if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) { - unsigned int limits[2] = { ctx->iowq_limits[0], - ctx->iowq_limits[1], }; + if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) { + unsigned int limits[2]; + + mutex_lock(&ctx->uring_lock); + limits[0] = ctx->iowq_limits[0]; + limits[1] = ctx->iowq_limits[1]; + mutex_unlock(&ctx->uring_lock); ret = io_wq_max_workers(tctx->io_wq, limits); if (ret) From 8e1f412b5bc690cb72b3303a1ae0d42955e5e2b3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 14:06:00 +0000 Subject: [PATCH 05/15] io_uring: fix spurious fput in registered ring path Fix an issue with io_uring_ctx_get_file() not gating fput() on whether or not the file descriptor is a registered/direct one. Fixes: c5e9f6a96bf7 ("io_uring: unify getting ctx from passed in file descriptor") Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index dd6326dc5f88..4ed998d60c09 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2575,7 +2575,8 @@ struct file *io_uring_ctx_get_file(unsigned int fd, bool registered) return ERR_PTR(-EBADF); if (io_is_uring_fops(file)) return file; - fput(file); + if (!registered) + fput(file); return ERR_PTR(-EOPNOTSUPP); } From 53262c91f7b81f96495ff24e9d1fa8b1632e69c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 13:14:54 -0600 Subject: [PATCH 06/15] io_uring/rsrc: unify nospec indexing for direct descriptors For file updates, the node reset isn't capping the value via array_index_nospec() like the other paths do. Ensure it's all sane and have the update path do the proper capping as well. 
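The capping follows the usual two-step pattern: bounds check first, then clamp the index with array_index_nospec() before it is used to index the table, so a mispredicted bounds check cannot be used to speculatively load out of range. Roughly (a sketch mirroring the io_reset_rsrc_node() hunk below):

    if (index >= data->nr)
            return false;
    /* clamp index under speculation before touching data->nodes[] */
    index = array_index_nospec(index, data->nr);
    node = data->nodes[index];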
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 3 +++ io_uring/rsrc.h | 9 +++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index fd36e0e319a2..c042054c3b5f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -238,6 +238,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, continue; i = up->offset + done; + if (i >= ctx->file_table.data.nr) + break; + i = array_index_nospec(i, ctx->file_table.data.nr); if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index cff0f8834c35..44e3386f7c1c 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -109,10 +109,15 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node } static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, - struct io_rsrc_data *data, int index) + struct io_rsrc_data *data, + unsigned int index) { - struct io_rsrc_node *node = data->nodes[index]; + struct io_rsrc_node *node; + if (index >= data->nr) + return false; + index = array_index_nospec(index, data->nr); + node = data->nodes[index]; if (!node) return false; io_put_rsrc_node(ctx, node); From 02b8d41c17630493f63c7785c873e327fa9b76a6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 13:15:41 -0600 Subject: [PATCH 07/15] io_uring/rsrc: use kvfree() for the imu cache Currently anything that requires kvmalloc_flex() for allocations will not get re-cached, and hence the cache freeing path is correct in that it always uses kfree() to free the allocated memory. But this seems a bit fragile as it's something that could get missed should that situation change, so switch io_free_imu() and io_alloc_cache_free() to use kvfree() as the destructor. Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 2 +- io_uring/rsrc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 45fcd8b3b824..962b6e2d04cc 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -64,7 +64,7 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) { if (!io_alloc_cache_put(cache, obj)) - kfree(obj); + kvfree(obj); } #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index c042054c3b5f..650303626be6 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -168,7 +168,7 @@ bool io_rsrc_cache_init(struct io_ring_ctx *ctx) void io_rsrc_cache_free(struct io_ring_ctx *ctx) { io_alloc_cache_free(&ctx->node_cache, kfree); - io_alloc_cache_free(&ctx->imu_cache, kfree); + io_alloc_cache_free(&ctx->imu_cache, kvfree); } static void io_clear_table_tags(struct io_rsrc_data *data) From 79968834558774bdc5de4b5503d412df632646aa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 13:16:19 -0600 Subject: [PATCH 08/15] io_uring/rw: add defensive hardening for negative kbuf lengths No real bug here, just being a bit defensive in ensuring that whatever gets passed into io_put_kbuf() is always >= 0 and not some random error value. 
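As a standalone illustration of what the clamping guards against should a negative error value ever end up in cqe->res on these paths (plain userspace C; -ECANCELED is just an arbitrary example of such a value):

    #include <stdio.h>

    int main(void)
    {
            int res = -125;                  /* e.g. -ECANCELED stored in cqe->res */
            unsigned int as_len = res;       /* naive use as a length: 4294967171 */
            int clamped = res > 0 ? res : 0; /* what max(req->cqe.res, 0) passes on: 0 */

            printf("as_len=%u clamped=%d\n", as_len, clamped);
            return 0;
    }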
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/rw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 20654deff84d..e729e0e7657e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -580,7 +580,7 @@ void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw) io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) - req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); + req->cqe.flags |= io_put_kbuf(req, max(req->cqe.res, 0), NULL); io_req_rw_cleanup(req, 0); io_req_task_complete(tw_req, tw); @@ -1379,7 +1379,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) list_del(&req->iopoll_node); wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs); nr_events++; - req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); + req->cqe.flags = io_put_kbuf(req, max(req->cqe.res, 0), NULL); if (!io_is_uring_cmd(req)) io_req_rw_cleanup(req, 0); } From 7faaa6812aba550c24bffdfd9399568223c8a477 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 14:24:50 -0600 Subject: [PATCH 09/15] io_uring/futex: ensure partial wakes are appropriately dequeued If a FUTEX_WAITV vectored operation is only partially woken, we should call __futex_wake_mark() on the queue to account for that. If not, then a later wakeup will wake the same entry, rather than the next one in line. Fixes: 8f350194d5cfd ("io_uring: add support for vectored futex waits") Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/futex.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index fd503c24b428..9cc1788ef4c6 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -159,8 +159,10 @@ static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) struct io_kiocb *req = q->wake_data; struct io_futexv_data *ifd = req->async_data; - if (!io_futexv_claim(ifd)) + if (!io_futexv_claim(ifd)) { + __futex_wake_mark(q); return; + } if (unlikely(!__futex_wake_mark(q))) return; From 45cd95763e198d74d369ede43aef0b1955b8dea4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Apr 2026 13:41:38 -0600 Subject: [PATCH 10/15] io_uring/register: fix ring resizing with mixed/large SQEs/CQEs The ring resizing only properly handles "normal" sized SQEs or CQEs, if there are pending entries around a resize. This normally should not be the case, but the code is supposed to handle this regardless. For the mixed SQE/CQE cases, the current copying works fine as they are indexed in the same way. Each half is just copied separately. But for fixed large SQEs and CQEs, the iteration and copy need to take that into account. 
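For example, with IORING_SETUP_SQE128 each logical SQE spans two 64-byte slots, so the slot index and the wrap masks are doubled and 128 bytes are copied per entry. A quick userspace model of that index math (the entry counts here are made up purely for illustration):

    #include <stdio.h>

    int main(void)
    {
            unsigned old_entries = 8, new_entries = 16; /* hypothetical old/new ring sizes */
            unsigned i = 5;                             /* logical SQE index being copied */
            int sqe128 = 1;
            unsigned index = i, size = 64;              /* 64 bytes == normal SQE size */
            unsigned src_mask = old_entries - 1, dst_mask = new_entries - 1;

            if (sqe128) {
                    index <<= 1;                        /* two slots per big SQE */
                    size <<= 1;                         /* copy 128 bytes */
                    src_mask = (old_entries << 1) - 1;
                    dst_mask = (new_entries << 1) - 1;
            }
            printf("copy %u bytes: slot %u -> slot %u\n",
                   size, index & src_mask, index & dst_mask);
            return 0;
    }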
Cc: stable@kernel.org Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/register.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index 24e593332d1a..dce5e2f9cf77 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -599,10 +599,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (tail - old_head > p->sq_entries) goto overflow; for (i = old_head; i < tail; i++) { - unsigned src_head = i & (ctx->sq_entries - 1); - unsigned dst_head = i & (p->sq_entries - 1); + unsigned index, dst_mask, src_mask; + size_t sq_size; - n.sq_sqes[dst_head] = o.sq_sqes[src_head]; + index = i; + sq_size = sizeof(struct io_uring_sqe); + src_mask = ctx->sq_entries - 1; + dst_mask = p->sq_entries - 1; + if (ctx->flags & IORING_SETUP_SQE128) { + index <<= 1; + sq_size <<= 1; + src_mask = (ctx->sq_entries << 1) - 1; + dst_mask = (p->sq_entries << 1) - 1; + } + memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size); } WRITE_ONCE(n.rings->sq.head, old_head); WRITE_ONCE(n.rings->sq.tail, tail); @@ -619,10 +629,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) goto out; } for (i = old_head; i < tail; i++) { - unsigned src_head = i & (ctx->cq_entries - 1); - unsigned dst_head = i & (p->cq_entries - 1); + unsigned index, dst_mask, src_mask; + size_t cq_size; - n.rings->cqes[dst_head] = o.rings->cqes[src_head]; + index = i; + cq_size = sizeof(struct io_uring_cqe); + src_mask = ctx->cq_entries - 1; + dst_mask = p->cq_entries - 1; + if (ctx->flags & IORING_SETUP_CQE32) { + index <<= 1; + cq_size <<= 1; + src_mask = (ctx->cq_entries << 1) - 1; + dst_mask = (p->cq_entries << 1) - 1; + } + memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size); } WRITE_ONCE(n.rings->cq.head, old_head); WRITE_ONCE(n.rings->cq.tail, tail); From 0fcccfd87152f957fa8312b841f6efef42a05a20 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Apr 2026 09:47:04 +0100 Subject: [PATCH 11/15] io_uring/zcrx: fix user_struct uaf io_free_rbuf_ring() uses a struct user_struct, but io_zcrx_ifq_free() puts it down before destroying the ring. Cc: stable@vger.kernel.org Fixes: 5c686456a4e83 ("io_uring/zcrx: add user_struct and mm_struct to io_zcrx_ifq") Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/e560ae00960d27a810522a7efc0e201c82dff351.1776760917.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 9a83d7eb4210..fab3693ecb0d 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -579,13 +579,13 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) if (ifq->area) io_zcrx_free_area(ifq, ifq->area); - free_uid(ifq->user); if (ifq->mm_account) mmdrop(ifq->mm_account); if (ifq->dev) put_device(ifq->dev); io_free_rbuf_ring(ifq); + free_uid(ifq->user); mutex_destroy(&ifq->pp_lock); kfree(ifq); } From 4f02cc4071a18c78bfff571d796edef055d57daa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Apr 2026 09:46:44 +0100 Subject: [PATCH 12/15] io_uring/zcrx: clear RQ headers on init It might be unexpected to users if the RQ head/tail after a ring creation are not zeroed, so fix that. 
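The RQ head/tail words are part of the shared producer/consumer contract, so stale memory there reads as bogus queue state as soon as the ring is mapped. A minimal model of the failure mode (generic unsigned ring arithmetic, not the actual zcrx structures):

    #include <stdio.h>

    int main(void)
    {
            /* freshly mapped ring header that was never cleared */
            unsigned int head = 0xdeadbeef, tail = 0;

            /* a consumer derives the number of posted entries from tail - head */
            printf("entries seemingly available: %u\n", tail - head);

            head = tail = 0; /* what zeroing the header at init guarantees */
            printf("entries after init: %u\n", tail - head);
            return 0;
    }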
Cc: stable@vger.kernel.org Fixes: 6f377873cb239 ("io_uring/zcrx: add interface queue and refill queue") Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/331f94663c3e8f021ffa3cb770ca2844a07d4855.1776760911.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index fab3693ecb0d..2eb09219f0a0 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -396,6 +396,7 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, ifq->rq.ring = (struct io_uring *)ptr; ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring)); return 0; } From 770594e78c3964cf23cf5287f849437cdde9b7d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Apr 2026 09:45:29 +0100 Subject: [PATCH 13/15] io_uring/zcrx: warn on freelist violations The freelist is appropriately sized to always be able to take a free niov, but let's be more defensive and check the invariant with a warning. That should help to catch any double-free issues. Suggested-by: Kai Aizen Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/2f3cea363b04649755e3b6bb9ab66485a95936d5.1776760901.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 2eb09219f0a0..7b93c87b8371 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -602,6 +602,8 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov) struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); guard(spinlock_bh)(&area->freelist_lock); + if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs)) + return; area->freelist[area->free_count++] = net_iov_idx(niov); } From 1967f0b1cafdde37aa9e08e6021c14bcc484b7a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 21 Apr 2026 13:24:33 -0600 Subject: [PATCH 14/15] io_uring/poll: ensure EPOLL_ONESHOT is propagated for EPOLL_URING_WAKE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit: aacf2f9f382c ("io_uring: fix req->apoll_events") fixed an issue where poll->events and req->apoll_events weren't synchronized, but then when the commit referenced in Fixes got added, it didn't ensure the same thing. If we mask in EPOLLONESHOT in the regular EPOLL_URING_WAKE path, then ensure it's done for both. Including a link to the original report below, even though it's mostly nonsense. But it includes a reproducer that does show that IORING_CQE_F_MORE is set in the previous CQE, while no more CQEs will be generated for this request. Just ignore anything that pretends this is security related in any way, it's just the typical AI nonsense. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/io-uring/CAM0zi7yQzF3eKncgHo4iVM5yFLAjsiob_ucqyWKs=hyd_GqiMg@mail.gmail.com/ Reported-by: Azizcan Daştan Fixes: 4464853277d0 ("io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and wakeups") Signed-off-by: Jens Axboe --- io_uring/poll.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 6834e2db937e..0204affdc308 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -417,8 +417,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * disable multishot as there is a circular dependency between * CQ posting and triggering the event. 
*/ - if (mask & EPOLL_URING_WAKE) + if (mask & EPOLL_URING_WAKE) { poll->events |= EPOLLONESHOT; + req->apoll_events |= EPOLLONESHOT; + } /* optional, saves extra locking for removal in tw handler */ if (mask && poll->events & EPOLLONESHOT) { From d0be8884f56b0b800cd8966e37ce23417cd5044e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 21 Apr 2026 15:46:16 +0200 Subject: [PATCH 15/15] io_uring: take page references for NOMMU pbuf_ring mmaps Under !CONFIG_MMU, io_uring_get_unmapped_area() returns the kernel virtual address of the io_mapped_region's backing pages directly; the user's VMA aliases the kernel allocation. io_uring_mmap() then just returns 0 -- it takes no page references. The CONFIG_MMU path uses vm_insert_pages(), which takes a reference on each inserted page. Those references are released when the VMA is torn down (zap_pte_range -> put_page). io_free_region() -> release_pages() drops the io_uring-side references, but the pages survive until munmap drops the VMA-side references. Under NOMMU there are no VMA-side references. io_unregister_pbuf_ring -> io_put_bl -> io_free_region -> release_pages drops the only references and the pages return to the buddy allocator while the user's VMA still has vm_start pointing into them. The user can then write into whatever the allocator hands out next. Mirror the MMU lifetime: take get_page references in io_uring_mmap() and release them via vm_ops->close. NOMMU's delete_vma() calls vma_close() which runs ->close on munmap. This also incidentally addresses the duplicate-vm_start case: two mmaps of SQ_RING and CQ_RING resolve to the same ctx->ring_region pointer. With page refs taken per mmap, the second mmap takes its own refs and the pages survive until both mmaps are closed. The nommu rb-tree BUG_ON on duplicate vm_start is a separate mm/nommu.c concern (it should share the existing region rather than BUG), but the page lifetime is now correct. Cc: Jens Axboe Reported-by: Anthropic Assisted-by: gkh_clanker_t1000 Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026042115-body-attention-d15b@gregkh [axboe: get rid of region lookup, just iterate pages in vma] Signed-off-by: Jens Axboe --- io_uring/memmap.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index e6958968975a..4f9b439319c4 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -366,9 +366,53 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, #else /* !CONFIG_MMU */ +/* + * Drop the pages that were initially referenced and added in + * io_uring_mmap(). We cannot have had a mremap() as that isn't supported, + * hence the vma should be identical to the one we initially referenced and + * mapped, and partial unmaps and splitting isn't possible on a file backed + * mapping. + */ +static void io_uring_nommu_vm_close(struct vm_area_struct *vma) +{ + unsigned long index; + + for (index = vma->vm_start; index < vma->vm_end; index += PAGE_SIZE) + put_page(virt_to_page((void *) index)); +} + +static const struct vm_operations_struct io_uring_nommu_vm_ops = { + .close = io_uring_nommu_vm_close, +}; + int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { - return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -EINVAL; + struct io_ring_ctx *ctx = file->private_data; + struct io_mapped_region *region; + unsigned long i; + + if (!is_nommu_shared_mapping(vma->vm_flags)) + return -EINVAL; + + guard(mutex)(&ctx->mmap_lock); + region = io_mmap_get_region(ctx, vma->vm_pgoff); + if (!region || !io_region_is_set(region)) + return -EINVAL; + + if ((vma->vm_end - vma->vm_start) != + (unsigned long) region->nr_pages << PAGE_SHIFT) + return -EINVAL; + + /* + * Pin the pages so io_free_region()'s release_pages() does not + * drop the last reference while this VMA exists. delete_vma() + * in mm/nommu.c calls vma_close() which runs ->close above. + */ + for (i = 0; i < region->nr_pages; i++) + get_page(region->pages[i]); + + vma->vm_ops = &io_uring_nommu_vm_ops; + return 0; } unsigned int io_uring_nommu_mmap_capabilities(struct file *file)