From 84230ad2d2afbf0c44c32967e525c0ad92e26b4e Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 1 Dec 2025 13:25:22 -0700
Subject: [PATCH 1/9] io_uring/poll: correctly handle io_poll_add() return
 value on update

When the core of io_uring was updated to handle completions consistently
and with fixed return codes, the POLL_REMOVE opcode with updates got
slightly broken. If a POLL_ADD is pending and POLL_REMOVE is then used to
update the events of that request, and the update causes the POLL_ADD to
trigger, then that completion is lost and a CQE is never posted.

Additionally, ensure that if an update does cause an existing POLL_ADD to
complete, the completion value isn't unconditionally overwritten with
-ECANCELED. In that case, whatever value io_poll_add() set should be
retained.

Cc: stable@vger.kernel.org
Fixes: 97b388d70b53 ("io_uring: handle completions in the core")
Reported-by: syzbot+641eec6b7af1f62f2b99@syzkaller.appspotmail.com
Tested-by: syzbot+641eec6b7af1f62f2b99@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8aa4e3a31e73..3f1d716dcfab 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -937,12 +937,17 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 
 		ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED);
 		/* successfully updated, don't complete poll request */
-		if (!ret2 || ret2 == -EIOCBQUEUED)
+		if (ret2 == IOU_ISSUE_SKIP_COMPLETE)
 			goto out;
+		/* request completed as part of the update, complete it */
+		else if (ret2 == IOU_COMPLETE)
+			goto complete;
 	}
 
-	req_set_fail(preq);
 	io_req_set_res(preq, -ECANCELED, 0);
+complete:
+	if (preq->cqe.res < 0)
+		req_set_fail(preq);
 	preq->io_task_work.func = io_req_task_complete;
 	io_req_task_work_add(preq);
 out:

From 34c78b8610a9befdcc139d34a7c98365f018026d Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Tue, 2 Dec 2025 13:57:44 -0700
Subject: [PATCH 2/9] io_uring/io-wq: always retry worker create on ERESTART*

If a task has a pending signal when create_io_thread() is called,
copy_process() will return -ERESTARTNOINTR. io_should_retry_thread() will
request a retry of create_io_thread() up to WORKER_INIT_LIMIT = 3 times.
If all retries fail, the io_uring request will fail with ECANCELED.

Commit 3918315c5dc ("io-wq: backoff when retrying worker creation") added
a linear backoff to allow the thread to handle its signal before the
retry. However, a thread receiving frequent signals may get unlucky and
have a signal pending at every retry.

Since the userspace task doesn't control when it receives signals, there's
no easy way for it to prevent the create_io_thread() failure due to
pending signals. The task may also lack the information necessary to
regenerate the canceled SQE. So always retry create_io_thread() on the
ERESTART* errors, analogous to what a fork() syscall would do. EAGAIN can
occur due to various persistent conditions such as exceeding RLIMIT_NPROC,
so respect the WORKER_INIT_LIMIT retry limit for EAGAIN errors.
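For readers following along outside the kernel tree, the resulting retry
policy can be sketched as a standalone C program. Everything below is
invented for the illustration (the ERESTART* values mirror the
kernel-internal codes in include/linux/errno.h); it is not the io-wq code
itself:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Kernel-internal restart codes, defined here only for the sketch */
#define ERESTARTSYS	512
#define ERESTARTNOINTR	513
#define ERESTARTNOHAND	514

#define WORKER_INIT_LIMIT	3

/*
 * Restartable errors (a signal was pending during thread creation) are
 * always retried, like a restarted fork(); EAGAIN is bounded because it
 * can reflect a persistent condition such as hitting RLIMIT_NPROC.
 */
static bool should_retry_create(int err, unsigned int *retries)
{
	(*retries)++;

	switch (err) {
	case EAGAIN:
		return *retries <= WORKER_INIT_LIMIT;
	case ERESTARTSYS:
	case ERESTARTNOINTR:
	case ERESTARTNOHAND:
		return true;
	default:
		return false;
	}
}

int main(void)
{
	unsigned int retries = 0;
	int allowed = 0;

	/* EAGAIN gives up once the retry limit is exceeded... */
	while (should_retry_create(EAGAIN, &retries))
		allowed++;
	printf("EAGAIN: %d retries allowed\n", allowed);	/* prints 3 */

	/* ...while restartable errors keep being retried */
	retries = 100;
	printf("ERESTARTNOINTR still retried: %d\n",
	       should_retry_create(ERESTARTNOINTR, &retries));	/* prints 1 */
	return 0;
}
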
Signed-off-by: Caleb Sander Mateos
Signed-off-by: Jens Axboe
---
 io_uring/io-wq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 1d03b2fc4b25..cd13d8aac3d2 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -805,11 +805,12 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err)
 	 */
 	if (fatal_signal_pending(current))
 		return false;
-	if (worker->init_retries++ >= WORKER_INIT_LIMIT)
-		return false;
+	worker->init_retries++;
 
 	switch (err) {
 	case -EAGAIN:
+		return worker->init_retries <= WORKER_INIT_LIMIT;
+	/* Analogous to a fork() syscall, always retry on a restartable error */
 	case -ERESTARTSYS:
 	case -ERESTARTNOINTR:
 	case -ERESTARTNOHAND:

From f345be751b961ce91e0b883345eaa1d0993a4949 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Tue, 2 Dec 2025 11:21:31 -0700
Subject: [PATCH 3/9] io_uring/trace: rename io_uring_queue_async_work event
 "rw" field

The io_uring_queue_async_work tracepoint event stores an int "rw" field
that indicates whether the work item is hashed. Rename it to "hashed" and
change its type to bool to more accurately reflect its value.

Signed-off-by: Caleb Sander Mateos
Signed-off-by: Jens Axboe
---
 include/trace/events/io_uring.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 45d15460b495..34b31a855ea4 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -133,15 +133,15 @@ TRACE_EVENT(io_uring_file_get,
  * io_uring_queue_async_work - called before submitting a new async work
  *
  * @req:	pointer to a submitted request
- * @rw:		type of workqueue, hashed or normal
+ * @hashed:	whether async work is hashed
  *
  * Allows to trace asynchronous work submission.
  */
 TRACE_EVENT(io_uring_queue_async_work,
 
-	TP_PROTO(struct io_kiocb *req, int rw),
+	TP_PROTO(struct io_kiocb *req, bool hashed),
 
-	TP_ARGS(req, rw),
+	TP_ARGS(req, hashed),
 
 	TP_STRUCT__entry (
 		__field(  void *,			ctx		)
@@ -150,7 +150,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__field(  u8,				opcode		)
 		__field(  unsigned long long,		flags		)
 		__field(  struct io_wq_work *,		work		)
-		__field(  int,				rw		)
+		__field(  bool,				hashed		)
 		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
@@ -162,7 +162,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__entry->flags		= (__force unsigned long long) req->flags;
 		__entry->opcode		= req->opcode;
 		__entry->work		= &req->work;
-		__entry->rw		= rw;
+		__entry->hashed		= hashed;
 		__assign_str(op_str);
 	),
 
@@ -170,7 +170,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
 		__entry->ctx, __entry->req, __entry->user_data,
 		__get_str(op_str), __entry->flags,
-		__entry->rw ? "hashed" : "normal", __entry->work)
+		__entry->hashed ? "hashed" : "normal", __entry->work)
 );
 
 /**

From b8201b50e403815f941d1c6581a27fdbfe7d0fd4 Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Thu, 4 Dec 2025 13:51:14 -0800
Subject: [PATCH 4/9] io_uring/rsrc: clean up buffer cloning arg validation

Get rid of some redundant checks and move the src arg validation to
before the buffer table allocation, which simplifies error handling.
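The pattern the cleanup moves toward can be shown in isolation: perform
every argument-only check before any resource is acquired, so later
failure paths need no unwind label. A minimal sketch with invented names
(not the io_uring code):

#include <errno.h>
#include <stdlib.h>

/* Illustrative stand-ins, not io_uring structures */
struct table { unsigned int nr; void **nodes; };

#define MAX_ENTRIES	(1U << 14)

/*
 * Validate-then-allocate: every check that only depends on the arguments
 * runs before memory is allocated, so early failures can simply return
 * and no "out_free" label is needed.
 */
static int clone_table(struct table *dst, const struct table *src,
		       unsigned int src_off, unsigned int nr)
{
	unsigned int end;

	if (!src->nr)
		return -ENXIO;
	if (!nr)
		nr = src->nr;
	else if (nr > src->nr || nr > MAX_ENTRIES)
		return -EINVAL;
	if (__builtin_add_overflow(nr, src_off, &end) || end > src->nr)
		return -EOVERFLOW;

	dst->nodes = calloc(nr, sizeof(*dst->nodes));
	if (!dst->nodes)
		return -ENOMEM;
	dst->nr = nr;

	for (unsigned int i = 0; i < nr; i++)
		dst->nodes[i] = src->nodes[src_off + i];
	return 0;
}

With validation hoisted like this, the only failure left after allocation
is the allocation itself, which mirrors how the patch drops the out_free
label below.
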
Signed-off-by: Joanne Koong
Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 3765a50329a8..5ad3d10413eb 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1186,12 +1186,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		return -EBUSY;
 
 	nbufs = src_ctx->buf_table.nr;
+	if (!nbufs)
+		return -ENXIO;
 	if (!arg->nr)
 		arg->nr = nbufs;
 	else if (arg->nr > nbufs)
 		return -EINVAL;
 	else if (arg->nr > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
+	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
+		return -EOVERFLOW;
 	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
 		return -EOVERFLOW;
 	if (nbufs > IORING_MAX_REG_BUFFERS)
@@ -1211,21 +1215,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		}
 	}
 
-	ret = -ENXIO;
-	nbufs = src_ctx->buf_table.nr;
-	if (!nbufs)
-		goto out_free;
-	ret = -EINVAL;
-	if (!arg->nr)
-		arg->nr = nbufs;
-	else if (arg->nr > nbufs)
-		goto out_free;
-	ret = -EOVERFLOW;
-	if (check_add_overflow(arg->nr, arg->src_off, &off))
-		goto out_free;
-	if (off > nbufs)
-		goto out_free;
-
 	off = arg->dst_off;
 	i = arg->src_off;
 	nr = arg->nr;
@@ -1238,8 +1227,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		} else {
 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 			if (!dst_node) {
-				ret = -ENOMEM;
-				goto out_free;
+				io_rsrc_data_free(ctx, &data);
+				return -ENOMEM;
 			}
 
 			refcount_inc(&src_node->buf->refs);
@@ -1265,10 +1254,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	WARN_ON_ONCE(ctx->buf_table.nr);
 	ctx->buf_table = data;
 	return 0;
-
-out_free:
-	io_rsrc_data_free(ctx, &data);
-	return ret;
 }
 
 /*

From e29af2aba262833c8eba578b58d6bbb6b0866a67 Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Thu, 4 Dec 2025 13:51:15 -0800
Subject: [PATCH 5/9] io_uring/rsrc: rename misleading src_node variable in
 io_clone_buffers()

The variable holds nodes from the destination ring's existing buffer
table, but in io_clone_buffers() the term "src" is used to refer to the
source ring. Rename it to "node" for clarity.

Signed-off-by: Joanne Koong
Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 5ad3d10413eb..04f56212398a 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1207,11 +1207,11 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 
 	/* Fill entries in data from dst that won't overlap with src */
 	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
-		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
 
-		if (src_node) {
-			data.nodes[i] = src_node;
-			src_node->refs++;
+		if (node) {
+			data.nodes[i] = node;
+			node->refs++;
 		}
 	}
 

From 525916ce496615f531091855604eab9ca573b195 Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Thu, 4 Dec 2025 13:51:16 -0800
Subject: [PATCH 6/9] io_uring/rsrc: fix lost entries after cloned range

When cloning with node replacements (IORING_REGISTER_DST_REPLACE),
destination entries after the cloned range are not copied over. Add logic
to copy them over to the new destination table.
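The effect of the fix can be pictured with a small sketch (invented names,
plain pointers standing in for the refcounted io_rsrc_node entries): the
new destination table has three regions, and the bug was that the third
region was never filled in.

/*
 * Build a new destination table when the range [dst_off, dst_off + nr)
 * is replaced with cloned source entries. new_tbl is assumed to be large
 * enough for max(old_nr, dst_off + nr) entries.
 */
static void build_new_table(void **new_tbl,
			    void **old_dst, unsigned int old_nr,
			    void **src, unsigned int src_off,
			    unsigned int dst_off, unsigned int nr)
{
	unsigned int i;

	/* region 1: original dst entries before the cloned range */
	for (i = 0; i < dst_off && i < old_nr; i++)
		new_tbl[i] = old_dst[i];

	/* region 2: the cloned range itself */
	for (i = 0; i < nr; i++)
		new_tbl[dst_off + i] = src[src_off + i];

	/* region 3: original dst entries after the cloned range */
	for (i = dst_off + nr; i < old_nr; i++)
		new_tbl[i] = old_dst[i];
}

The loop added by the patch corresponds to region 3 here.
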
Fixes: c1329532d5aa ("io_uring/rsrc: allow cloning with node replacements")
Cc: stable@vger.kernel.org
Signed-off-by: Joanne Koong
Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 04f56212398a..a63474b331bf 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1205,7 +1205,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	if (ret)
 		return ret;
 
-	/* Fill entries in data from dst that won't overlap with src */
+	/* Copy original dst nodes from before the cloned range */
 	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
 		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
 
@@ -1238,6 +1238,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		i++;
 	}
 
+	/* Copy original dst nodes from after the cloned range */
+	for (i = nbufs; i < ctx->buf_table.nr; i++) {
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
+
+		if (node) {
+			data.nodes[i] = node;
+			node->refs++;
+		}
+	}
+
 	/*
 	 * If asked for replace, put the old table. data->nodes[] holds both
 	 * old and new nodes at this point.

From 78385c7299f7514697d196b3233a91bd5e485591 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Thu, 4 Dec 2025 15:43:31 -0700
Subject: [PATCH 7/9] io_uring/kbuf: use READ_ONCE() for userspace-mapped
 memory

The struct io_uring_buf elements in a buffer ring are in a memory region
accessible from userspace. A malicious/buggy userspace program could
therefore write to them at any time, so they should be accessed with
READ_ONCE() in the kernel.

Commit 98b6fa62c84f ("io_uring/kbuf: always use READ_ONCE() to read ring
provided buffer lengths") already switched the reads of the len field to
READ_ONCE(). Do the same for bid and addr.
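For readers outside the kernel, the intent can be sketched in plain C. The
macro below is a userspace stand-in for the kernel's READ_ONCE(), and the
struct is an invented simplification of struct io_uring_buf:

#include <stdint.h>

/* Userspace approximation of READ_ONCE(), for the sketch only */
#define READ_ONCE(x)	(*(const volatile __typeof__(x) *)&(x))

struct shared_buf {	/* stand-in for a userspace-mapped descriptor */
	uint64_t addr;
	uint32_t len;
	uint16_t bid;
};

/*
 * Each shared field is loaded exactly once into a local variable and all
 * later uses work on that local copy. Without READ_ONCE() the compiler is
 * free to reload buf->addr or buf->bid, and a concurrent (possibly
 * malicious) writer could make two uses of the "same" value disagree.
 */
static void consume(struct shared_buf *buf,
		    uint64_t *addr_out, uint16_t *bid_out)
{
	uint64_t addr = READ_ONCE(buf->addr);
	uint16_t bid = READ_ONCE(buf->bid);

	*addr_out = addr;
	*bid_out = bid;
}
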
Signed-off-by: Caleb Sander Mateos
Fixes: c7fb19428d67 ("io_uring: add support for ring mapped supplied buffers")
Cc: Joanne Koong
Signed-off-by: Jens Axboe
---
 io_uring/kbuf.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 8a329556f8df..52b636d00a6b 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -44,7 +44,7 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
 		buf_len -= this_len;
 		/* Stop looping for invalid buffer length of 0 */
 		if (buf_len || !this_len) {
-			buf->addr += this_len;
+			buf->addr = READ_ONCE(buf->addr) + this_len;
 			buf->len = buf_len;
 			return false;
 		}
@@ -198,9 +198,9 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 		if (*len == 0 || *len > buf_len)
 			*len = buf_len;
 		req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
-		req->buf_index = buf->bid;
+		req->buf_index = READ_ONCE(buf->bid);
 		sel.buf_list = bl;
-		sel.addr = u64_to_user_ptr(buf->addr);
+		sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
 
 		if (io_should_commit(req, issue_flags)) {
 			io_kbuf_commit(req, sel.buf_list, *len, 1);
@@ -280,7 +280,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 	if (!arg->max_len)
 		arg->max_len = INT_MAX;
 
-	req->buf_index = buf->bid;
+	req->buf_index = READ_ONCE(buf->bid);
 	do {
 		u32 len = READ_ONCE(buf->len);
 
@@ -295,7 +295,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 			}
 		}
 
-		iov->iov_base = u64_to_user_ptr(buf->addr);
+		iov->iov_base = u64_to_user_ptr(READ_ONCE(buf->addr));
 		iov->iov_len = len;
 		iov++;
 

From a4c694bfc2455e82b7caf6045ca893d123e0ed11 Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Thu, 4 Dec 2025 15:54:50 -0800
Subject: [PATCH 8/9] io_uring/kbuf: use WRITE_ONCE() for userspace-shared
 buffer ring fields

buf->addr and buf->len reside in memory shared with userspace. They
should be written with WRITE_ONCE() to guarantee atomic stores and
prevent tearing or other unsafe compiler optimizations.

Signed-off-by: Joanne Koong
Cc: Caleb Sander Mateos
Signed-off-by: Jens Axboe
---
 io_uring/kbuf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 52b636d00a6b..796d131107dd 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -44,11 +44,11 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
 		buf_len -= this_len;
 		/* Stop looping for invalid buffer length of 0 */
 		if (buf_len || !this_len) {
-			buf->addr = READ_ONCE(buf->addr) + this_len;
-			buf->len = buf_len;
+			WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
+			WRITE_ONCE(buf->len, buf_len);
 			return false;
 		}
-		buf->len = 0;
+		WRITE_ONCE(buf->len, 0);
 		bl->head++;
 		len -= this_len;
 	}
@@ -291,7 +291,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 				arg->partial_map = 1;
 				if (iov != arg->iovs)
 					break;
-				buf->len = len;
+				WRITE_ONCE(buf->len, len);
 			}
 		}
 

From 55d57b3bcc7efcab812a8179e2dc17d781302997 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 5 Dec 2025 10:20:47 -0700
Subject: [PATCH 9/9] io_uring/poll: unify poll waitqueue entry and list
 removal

For some cases, the order in which the waitqueue entry is removed from its
list and the head pointer is cleared is important; for others it doesn't
really matter. Either way, it's somewhat confusing to have the same
sequence open-coded in several places in the file. Abstract the nicely
documented code in io_pollfree_wake() into a helper, and use that helper
consistently rather than having the other call sites do the same thing
manually.
While at it, correct a comment function name as well.

Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 3f1d716dcfab..aac4b3b881fb 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -138,14 +138,32 @@ static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 }
 
+static void io_poll_remove_waitq(struct io_poll *poll)
+{
+	/*
+	 * If the waitqueue is being freed early but someone already holds
+	 * ownership over it, we have to tear down the request as best we can.
+	 * That means immediately removing the request from its waitqueue and
+	 * preventing all further accesses to the waitqueue via the request.
+	 */
+	list_del_init(&poll->wait.entry);
+
+	/*
+	 * Careful: this *must* be the last step, since as soon as req->head is
+	 * NULL'ed out, the request can be completed and freed, since
+	 * io_poll_remove_entry() will no longer need to take the waitqueue
+	 * lock.
+	 */
+	smp_store_release(&poll->head, NULL);
+}
+
 static inline void io_poll_remove_entry(struct io_poll *poll)
 {
 	struct wait_queue_head *head = smp_load_acquire(&poll->head);
 
 	if (head) {
 		spin_lock_irq(&head->lock);
-		list_del_init(&poll->wait.entry);
-		poll->head = NULL;
+		io_poll_remove_waitq(poll);
 		spin_unlock_irq(&head->lock);
 	}
 }
@@ -368,23 +386,7 @@ static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
 	io_poll_mark_cancelled(req);
 	/* we have to kick tw in case it's not already */
 	io_poll_execute(req, 0);
-
-	/*
-	 * If the waitqueue is being freed early but someone is already
-	 * holds ownership over it, we have to tear down the request as
-	 * best we can. That means immediately removing the request from
-	 * its waitqueue and preventing all further accesses to the
-	 * waitqueue via the request.
-	 */
-	list_del_init(&poll->wait.entry);
-
-	/*
-	 * Careful: this *must* be the last step, since as soon
-	 * as req->head is NULL'ed out, the request can be
-	 * completed and freed, since aio_poll_complete_work()
-	 * will no longer need to take the waitqueue lock.
-	 */
-	smp_store_release(&poll->head, NULL);
+	io_poll_remove_waitq(poll);
 	return 1;
 }
 
@@ -413,8 +415,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 
 	/* optional, saves extra locking for removal in tw handler */
 	if (mask && poll->events & EPOLLONESHOT) {
-		list_del_init(&poll->wait.entry);
-		poll->head = NULL;
+		io_poll_remove_waitq(poll);
 		if (wqe_is_double(wait))
 			req->flags &= ~REQ_F_DOUBLE_POLL;
 		else
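The ordering that io_poll_remove_waitq() documents, unlink first and
publish head == NULL last, can also be illustrated outside the kernel. The
sketch below uses C11 atomics as a stand-in for smp_store_release() and
smp_load_acquire(); all names and types are invented for the example:

#include <stdatomic.h>
#include <stddef.h>

struct waiter {
	struct waiter *next;		/* stand-in for the waitqueue entry */
	_Atomic(void *) head;		/* stand-in for poll->head */
};

static void unlink_entry(struct waiter *w)
{
	/* detail elided: remove w from whatever list it is on */
	w->next = NULL;
}

static void teardown(struct waiter *w)
{
	/* Step 1: unlink from the waitqueue while we still own it */
	unlink_entry(w);

	/*
	 * Step 2 (must be last): publish head == NULL with release
	 * semantics. A reader that load-acquires head and sees NULL is
	 * guaranteed to also observe the unlink above, so it may free the
	 * request without touching the waitqueue again.
	 */
	atomic_store_explicit(&w->head, NULL, memory_order_release);
}

static int owner_can_skip_lock(struct waiter *w)
{
	/* Pairs with the release store in teardown() */
	return atomic_load_explicit(&w->head, memory_order_acquire) == NULL;
}

Seeing NULL with acquire semantics guarantees the unlink is also visible,
which is why the store to poll->head must come last in the helper.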