Merge tag 'io_uring-7.1-20260424' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

 - Fix for a NOMMU bug with io_uring, where NOMMU doesn't grab page refs
   at mmap time. NOMMU also has entirely broken FOLL_PIN support, yet
   here we are

 - A few fixes covering minor issues introduced in this merge window

 - data race annotation to shut up KCSAN for when io-wq limits are
   applied

 - A nospec addition for direct descriptor file updating. Rest of the
   direct descriptor path already had this, but for some reason the
   update did not. Now they are all the same

 - Various minor defensive changes that claude identified and suggested
   terrible fixes for, turned into actually useful cleanups:

       - Use kvfree() for the imu cache. These can come from kmalloc or
         vmalloc depending on size, but the in-cache ones are capped
         where it's always kmalloc based. Change to kvfree() in the
         cleanup path, making future changes unlikely to mess that up

 - Negative kbuf consumption lengths. Can't happen right now, but
   cqe->res is used directly, which, if other code changes, could
   then become an error value

 - Fix for an issue with the futex code, where partial wakes on a
   vectored futexes would potentially wake the same futex twice, rather
   than move on to the next one. This could confuse an application as it
   would've expected the next futex to have been woken

 - Fix for a bug with ring resizing, where SQEs or CQEs might not have
   been copied correctly if large SQEs or CQEs are used in the ring.
   Application side issue, where SQEs or CQEs might have been lost
   during resize

 - Fix for a bug where EPOLL_URING_WAKE might have been lost, causing a
   multishot poll to not be terminated when it's nested, like it should
   have been

 - Fix for an issue with signed comparison of poll references for the
   slow path

 - Fix for a user struct UAF in the zcrx code

 - Two minor zcrx cleanups

* tag 'io_uring-7.1-20260424' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring: take page references for NOMMU pbuf_ring mmaps
  io_uring/poll: ensure EPOLL_ONESHOT is propagated for EPOLL_URING_WAKE
  io_uring/zcrx: warn on freelist violations
  io_uring/zcrx: clear RQ headers on init
  io_uring/zcrx: fix user_struct uaf
  io_uring/register: fix ring resizing with mixed/large SQEs/CQEs
  io_uring/futex: ensure partial wakes are appropriately dequeued
  io_uring/rw: add defensive hardening for negative kbuf lengths
  io_uring/rsrc: use kvfree() for the imu cache
  io_uring/rsrc: unify nospec indexing for direct descriptors
  io_uring: fix spurious fput in registered ring path
  io_uring: fix iowq_limits data race in tctx node addition
  io_uring/tctx: mark io_wq as exiting before error path teardown
  io_uring/tctx: check for setup tctx->io_wq before teardown
  io_uring/poll: fix signed comparison in io_poll_get_ownership()
This commit is contained in:
Linus Torvalds
2026-04-24 15:00:54 -07:00
11 changed files with 109 additions and 22 deletions

View File

@@ -64,7 +64,7 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp)
/*
 * Return @obj to @cache; if the cache is full, free it outright.
 *
 * kvfree() is used (not kfree()) because cached objects may originate
 * from either kmalloc or vmalloc depending on their size — per the
 * merge log, in-cache objects are size-capped to the kmalloc case, but
 * kvfree() handles both and keeps future changes from breaking this.
 */
static inline void io_cache_free(struct io_alloc_cache *cache, void *obj)
{
	if (!io_alloc_cache_put(cache, obj))
		kvfree(obj);
}
#endif

View File

@@ -159,8 +159,10 @@ static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q)
struct io_kiocb *req = q->wake_data;
struct io_futexv_data *ifd = req->async_data;
if (!io_futexv_claim(ifd))
if (!io_futexv_claim(ifd)) {
__futex_wake_mark(q);
return;
}
if (unlikely(!__futex_wake_mark(q)))
return;

View File

@@ -2575,7 +2575,8 @@ struct file *io_uring_ctx_get_file(unsigned int fd, bool registered)
return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
return file;
fput(file);
if (!registered)
fput(file);
return ERR_PTR(-EOPNOTSUPP);
}

View File

@@ -366,9 +366,53 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
#else /* !CONFIG_MMU */
/*
 * VMA close handler for NOMMU io_uring mappings: drop the page
 * references taken in io_uring_mmap(). mremap() is unsupported on
 * NOMMU and file-backed mappings cannot be partially unmapped or
 * split, so the VMA here is identical to the one originally mapped
 * and every page in [vm_start, vm_end) was referenced by us.
 */
static void io_uring_nommu_vm_close(struct vm_area_struct *vma)
{
	unsigned long addr = vma->vm_start;

	while (addr < vma->vm_end) {
		put_page(virt_to_page((void *) addr));
		addr += PAGE_SIZE;
	}
}
/* NOMMU-only VMA ops: release pinned page refs when the mapping goes away. */
static const struct vm_operations_struct io_uring_nommu_vm_ops = {
.close = io_uring_nommu_vm_close,
};
/*
 * NOMMU mmap handler: validate that the requested range matches a
 * registered region exactly, then pin the region's pages for the
 * lifetime of the VMA.
 *
 * NOMMU does not take page references at mmap time, so without the
 * explicit get_page() here io_free_region()'s release_pages() could
 * drop the last reference while this VMA still exists. The matching
 * put_page() runs from io_uring_nommu_vm_close() when the VMA is torn
 * down.
 *
 * Returns 0 on success, -EINVAL if the mapping is not a shared NOMMU
 * mapping or does not correspond to a set region of the right size.
 */
int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct io_ring_ctx *ctx = file->private_data;
	struct io_mapped_region *region;
	unsigned long i;

	if (!is_nommu_shared_mapping(vma->vm_flags))
		return -EINVAL;

	guard(mutex)(&ctx->mmap_lock);
	region = io_mmap_get_region(ctx, vma->vm_pgoff);
	if (!region || !io_region_is_set(region))
		return -EINVAL;
	/* The mapping must cover the region exactly — no partial maps */
	if ((vma->vm_end - vma->vm_start) !=
	    (unsigned long) region->nr_pages << PAGE_SHIFT)
		return -EINVAL;

	/*
	 * Pin the pages so io_free_region()'s release_pages() does not
	 * drop the last reference while this VMA exists. delete_vma()
	 * in mm/nommu.c calls vma_close() which runs ->close above.
	 */
	for (i = 0; i < region->nr_pages; i++)
		get_page(region->pages[i]);
	vma->vm_ops = &io_uring_nommu_vm_ops;
	return 0;
}
unsigned int io_uring_nommu_mmap_capabilities(struct file *file)

View File

@@ -93,7 +93,7 @@ static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
*/
/*
 * Attempt to claim ownership of a poll request by bumping poll_refs.
 *
 * The refcount must be compared as unsigned: IO_POLL_REF_BIAS diverts
 * to the slowpath, and a signed comparison would fail to take it once
 * the counter has the sign bit set (the issue fixed by this pull's
 * "fix signed comparison in io_poll_get_ownership()" commit).
 *
 * Returns true if ownership was acquired, false otherwise.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
	if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
		return io_poll_get_ownership_slowpath(req);
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
@@ -417,8 +417,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
* disable multishot as there is a circular dependency between
* CQ posting and triggering the event.
*/
if (mask & EPOLL_URING_WAKE)
if (mask & EPOLL_URING_WAKE) {
poll->events |= EPOLLONESHOT;
req->apoll_events |= EPOLLONESHOT;
}
/* optional, saves extra locking for removal in tw handler */
if (mask && poll->events & EPOLLONESHOT) {

View File

@@ -599,10 +599,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
if (tail - old_head > p->sq_entries)
goto overflow;
for (i = old_head; i < tail; i++) {
unsigned src_head = i & (ctx->sq_entries - 1);
unsigned dst_head = i & (p->sq_entries - 1);
unsigned index, dst_mask, src_mask;
size_t sq_size;
n.sq_sqes[dst_head] = o.sq_sqes[src_head];
index = i;
sq_size = sizeof(struct io_uring_sqe);
src_mask = ctx->sq_entries - 1;
dst_mask = p->sq_entries - 1;
if (ctx->flags & IORING_SETUP_SQE128) {
index <<= 1;
sq_size <<= 1;
src_mask = (ctx->sq_entries << 1) - 1;
dst_mask = (p->sq_entries << 1) - 1;
}
memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size);
}
WRITE_ONCE(n.rings->sq.head, old_head);
WRITE_ONCE(n.rings->sq.tail, tail);
@@ -619,10 +629,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
goto out;
}
for (i = old_head; i < tail; i++) {
unsigned src_head = i & (ctx->cq_entries - 1);
unsigned dst_head = i & (p->cq_entries - 1);
unsigned index, dst_mask, src_mask;
size_t cq_size;
n.rings->cqes[dst_head] = o.rings->cqes[src_head];
index = i;
cq_size = sizeof(struct io_uring_cqe);
src_mask = ctx->cq_entries - 1;
dst_mask = p->cq_entries - 1;
if (ctx->flags & IORING_SETUP_CQE32) {
index <<= 1;
cq_size <<= 1;
src_mask = (ctx->cq_entries << 1) - 1;
dst_mask = (p->cq_entries << 1) - 1;
}
memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size);
}
WRITE_ONCE(n.rings->cq.head, old_head);
WRITE_ONCE(n.rings->cq.tail, tail);

View File

@@ -168,7 +168,7 @@ bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
/*
 * Free the per-ring resource caches.
 *
 * Node cache entries are always kmalloc'ed, so kfree() is correct for
 * them. imu cache entries may come from kmalloc or vmalloc depending on
 * size, so that cache must be released with kvfree() — each cache is
 * freed exactly once with the matching release function.
 */
void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kvfree);
}
static void io_clear_table_tags(struct io_rsrc_data *data)
@@ -238,6 +238,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
continue;
i = up->offset + done;
if (i >= ctx->file_table.data.nr)
break;
i = array_index_nospec(i, ctx->file_table.data.nr);
if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
io_file_bitmap_clear(&ctx->file_table, i);

View File

@@ -109,10 +109,15 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node
}
static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
struct io_rsrc_data *data, int index)
struct io_rsrc_data *data,
unsigned int index)
{
struct io_rsrc_node *node = data->nodes[index];
struct io_rsrc_node *node;
if (index >= data->nr)
return false;
index = array_index_nospec(index, data->nr);
node = data->nodes[index];
if (!node)
return false;
io_put_rsrc_node(ctx, node);

View File

@@ -580,7 +580,7 @@ void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);
req->cqe.flags |= io_put_kbuf(req, max(req->cqe.res, 0), NULL);
io_req_rw_cleanup(req, 0);
io_req_task_complete(tw_req, tw);
@@ -1379,7 +1379,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
list_del(&req->iopoll_node);
wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs);
nr_events++;
req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
req->cqe.flags = io_put_kbuf(req, max(req->cqe.res, 0), NULL);
if (!io_is_uring_cmd(req))
io_req_rw_cleanup(req, 0);
}

View File

@@ -146,9 +146,13 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
if (IS_ERR(tctx))
return PTR_ERR(tctx);
if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) {
unsigned int limits[2] = { ctx->iowq_limits[0],
ctx->iowq_limits[1], };
if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) {
unsigned int limits[2];
mutex_lock(&ctx->uring_lock);
limits[0] = ctx->iowq_limits[0];
limits[1] = ctx->iowq_limits[1];
mutex_unlock(&ctx->uring_lock);
ret = io_wq_max_workers(tctx->io_wq, limits);
if (ret)
@@ -171,7 +175,10 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
}
if (!current->io_uring) {
err_free:
io_wq_put_and_exit(tctx->io_wq);
if (tctx->io_wq) {
io_wq_exit_start(tctx->io_wq);
io_wq_put_and_exit(tctx->io_wq);
}
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
}

View File

@@ -396,6 +396,7 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
ifq->rq.ring = (struct io_uring *)ptr;
ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring));
return 0;
}
@@ -579,13 +580,13 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
if (ifq->area)
io_zcrx_free_area(ifq, ifq->area);
free_uid(ifq->user);
if (ifq->mm_account)
mmdrop(ifq->mm_account);
if (ifq->dev)
put_device(ifq->dev);
io_free_rbuf_ring(ifq);
free_uid(ifq->user);
mutex_destroy(&ifq->pp_lock);
kfree(ifq);
}
@@ -601,6 +602,8 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov)
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
guard(spinlock_bh)(&area->freelist_lock);
if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs))
return;
area->freelist[area->free_count++] = net_iov_idx(niov);
}