Merge tag 'io_uring-7.1-20260424' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

 - Fix for a NOMMU bug with io_uring, where NOMMU doesn't grab page refs
   at mmap time. NOMMU also has entirely broken FOLL_PIN support, yet
   here we are

 - A few fixes covering minor issues introduced in this merge window

 - data race annotation to shut up KCSAN for when io-wq limits are
   applied

 - A nospec addition for direct descriptor file updating. Rest of the
   direct descriptor path already had this, but for some reason the
   update did not. Now they are all the same

 - Various minor defensive changes that claude identified and suggested
   terrible fixes for, turned into actually useful cleanups:

       - Use kvfree() for the imu cache. These can come from kmalloc or
         vmalloc depending on size, but the in-cache ones are capped
         where it's always kmalloc based. Change to kvfree() in the
         cleanup path, making future changes unlikely to mess that up

 - Negative kbuf consumption lengths. Can't happen right now, but
   cqe->res is used directly, which, if other code changes, could
   then become an error value

 - Fix for an issue with the futex code, where partial wakes on a
   vectored futexes would potentially wake the same futex twice, rather
   than move on to the next one. This could confuse an application as it
   would've expected the next futex to have been woken

 - Fix for a bug with ring resizing, where SQEs or CQEs might not have
   been copied correctly if large SQEs or CQEs are used in the ring.
   Application side issue, where SQEs or CQEs might have been lost
   during resize

 - Fix for a bug where EPOLL_URING_WAKE might have been lost, causing a
   multishot poll to not be terminated when it's nested, like it should
   have been

 - Fix for an issue with signed comparison of poll references for the
   slow path

 - Fix for a user struct UAF in the zcrx code

 - Two minor zcrx cleanups

* tag 'io_uring-7.1-20260424' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring: take page references for NOMMU pbuf_ring mmaps
  io_uring/poll: ensure EPOLL_ONESHOT is propagated for EPOLL_URING_WAKE
  io_uring/zcrx: warn on freelist violations
  io_uring/zcrx: clear RQ headers on init
  io_uring/zcrx: fix user_struct uaf
  io_uring/register: fix ring resizing with mixed/large SQEs/CQEs
  io_uring/futex: ensure partial wakes are appropriately dequeued
  io_uring/rw: add defensive hardening for negative kbuf lengths
  io_uring/rsrc: use kvfree() for the imu cache
  io_uring/rsrc: unify nospec indexing for direct descriptors
  io_uring: fix spurious fput in registered ring path
  io_uring: fix iowq_limits data race in tctx node addition
  io_uring/tctx: mark io_wq as exiting before error path teardown
  io_uring/tctx: check for setup tctx->io_wq before teardown
  io_uring/poll: fix signed comparison in io_poll_get_ownership()
This commit is contained in:
Linus Torvalds
2026-04-24 15:00:54 -07:00
11 changed files with 109 additions and 22 deletions

View File

@@ -64,7 +64,7 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp)
/*
 * Return @obj to @cache; if the cache is full, free it outright.
 *
 * kvfree() is used (not kfree()) because cached objects may originate
 * from either kmalloc or vmalloc depending on their size — per the
 * merge log, in-cache objects are size-capped to the kmalloc case, but
 * kvfree() handles both and keeps future changes from breaking this.
 */
static inline void io_cache_free(struct io_alloc_cache *cache, void *obj)
{
	if (!io_alloc_cache_put(cache, obj))
		kvfree(obj);
}
#endif

View File

@@ -159,8 +159,10 @@ static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q)
struct io_kiocb *req = q->wake_data;
struct io_futexv_data *ifd = req->async_data;
if (!io_futexv_claim(ifd))
if (!io_futexv_claim(ifd)) {
__futex_wake_mark(q);
return;
}
if (unlikely(!__futex_wake_mark(q)))
return;

View File

@@ -2575,7 +2575,8 @@ struct file *io_uring_ctx_get_file(unsigned int fd, bool registered)
return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
return file;
fput(file);
if (!registered)
fput(file);
return ERR_PTR(-EOPNOTSUPP);
}

View File

@@ -366,9 +366,53 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
#else /* !CONFIG_MMU */
/*
 * VMA close handler for NOMMU io_uring mappings: drop the page
 * references taken in io_uring_mmap(). mremap() is unsupported on
 * NOMMU and file-backed mappings cannot be partially unmapped or
 * split, so the VMA here is identical to the one originally mapped
 * and every page in [vm_start, vm_end) was referenced by us.
 */
static void io_uring_nommu_vm_close(struct vm_area_struct *vma)
{
	unsigned long addr = vma->vm_start;

	while (addr < vma->vm_end) {
		put_page(virt_to_page((void *) addr));
		addr += PAGE_SIZE;
	}
}
/* NOMMU-only VMA ops: release pinned page refs when the mapping goes away. */
static const struct vm_operations_struct io_uring_nommu_vm_ops = {
.close = io_uring_nommu_vm_close,
};
/*
 * NOMMU mmap handler: validate that the requested range matches a
 * registered region exactly, then pin the region's pages for the
 * lifetime of the VMA.
 *
 * NOMMU does not take page references at mmap time, so without the
 * explicit get_page() here io_free_region()'s release_pages() could
 * drop the last reference while this VMA still exists. The matching
 * put_page() runs from io_uring_nommu_vm_close() when the VMA is torn
 * down.
 *
 * Returns 0 on success, -EINVAL if the mapping is not a shared NOMMU
 * mapping or does not correspond to a set region of the right size.
 */
int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct io_ring_ctx *ctx = file->private_data;
	struct io_mapped_region *region;
	unsigned long i;

	if (!is_nommu_shared_mapping(vma->vm_flags))
		return -EINVAL;

	guard(mutex)(&ctx->mmap_lock);
	region = io_mmap_get_region(ctx, vma->vm_pgoff);
	if (!region || !io_region_is_set(region))
		return -EINVAL;
	/* The mapping must cover the region exactly — no partial maps */
	if ((vma->vm_end - vma->vm_start) !=
	    (unsigned long) region->nr_pages << PAGE_SHIFT)
		return -EINVAL;

	/*
	 * Pin the pages so io_free_region()'s release_pages() does not
	 * drop the last reference while this VMA exists. delete_vma()
	 * in mm/nommu.c calls vma_close() which runs ->close above.
	 */
	for (i = 0; i < region->nr_pages; i++)
		get_page(region->pages[i]);
	vma->vm_ops = &io_uring_nommu_vm_ops;
	return 0;
}
unsigned int io_uring_nommu_mmap_capabilities(struct file *file)

View File

@@ -93,7 +93,7 @@ static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
*/
/*
 * Attempt to claim ownership of a poll request by bumping poll_refs.
 *
 * The refcount must be compared as unsigned: IO_POLL_REF_BIAS diverts
 * to the slowpath, and a signed comparison would fail to take it once
 * the counter has the sign bit set (the issue fixed by this pull's
 * "fix signed comparison in io_poll_get_ownership()" commit).
 *
 * Returns true if ownership was acquired, false otherwise.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
	if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
		return io_poll_get_ownership_slowpath(req);
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
@@ -417,8 +417,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
* disable multishot as there is a circular dependency between
* CQ posting and triggering the event.
*/
if (mask & EPOLL_URING_WAKE)
if (mask & EPOLL_URING_WAKE) {
poll->events |= EPOLLONESHOT;
req->apoll_events |= EPOLLONESHOT;
}
/* optional, saves extra locking for removal in tw handler */
if (mask && poll->events & EPOLLONESHOT) {

View File

@@ -599,10 +599,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
if (tail - old_head > p->sq_entries)
goto overflow;
for (i = old_head; i < tail; i++) {
unsigned src_head = i & (ctx->sq_entries - 1);
unsigned dst_head = i & (p->sq_entries - 1);
unsigned index, dst_mask, src_mask;
size_t sq_size;
n.sq_sqes[dst_head] = o.sq_sqes[src_head];
index = i;
sq_size = sizeof(struct io_uring_sqe);
src_mask = ctx->sq_entries - 1;
dst_mask = p->sq_entries - 1;
if (ctx->flags & IORING_SETUP_SQE128) {
index <<= 1;
sq_size <<= 1;
src_mask = (ctx->sq_entries << 1) - 1;
dst_mask = (p->sq_entries << 1) - 1;
}
memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size);
}
WRITE_ONCE(n.rings->sq.head, old_head);
WRITE_ONCE(n.rings->sq.tail, tail);
@@ -619,10 +629,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
goto out;
}
for (i = old_head; i < tail; i++) {
unsigned src_head = i & (ctx->cq_entries - 1);
unsigned dst_head = i & (p->cq_entries - 1);
unsigned index, dst_mask, src_mask;
size_t cq_size;
n.rings->cqes[dst_head] = o.rings->cqes[src_head];
index = i;
cq_size = sizeof(struct io_uring_cqe);
src_mask = ctx->cq_entries - 1;
dst_mask = p->cq_entries - 1;
if (ctx->flags & IORING_SETUP_CQE32) {
index <<= 1;
cq_size <<= 1;
src_mask = (ctx->cq_entries << 1) - 1;
dst_mask = (p->cq_entries << 1) - 1;
}
memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size);
}
WRITE_ONCE(n.rings->cq.head, old_head);
WRITE_ONCE(n.rings->cq.tail, tail);

View File

@@ -168,7 +168,7 @@ bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
/*
 * Free the per-ring resource caches.
 *
 * Node cache entries are always kmalloc'ed, so kfree() is correct for
 * them. imu cache entries may come from kmalloc or vmalloc depending on
 * size, so that cache must be released with kvfree() — each cache is
 * freed exactly once with the matching release function.
 */
void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kvfree);
}
static void io_clear_table_tags(struct io_rsrc_data *data)
@@ -238,6 +238,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
continue;
i = up->offset + done;
if (i >= ctx->file_table.data.nr)
break;
i = array_index_nospec(i, ctx->file_table.data.nr);
if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
io_file_bitmap_clear(&ctx->file_table, i);

View File

@@ -109,10 +109,15 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node
}
static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
struct io_rsrc_data *data, int index)
struct io_rsrc_data *data,
unsigned int index)
{
struct io_rsrc_node *node = data->nodes[index];
struct io_rsrc_node *node;
if (index >= data->nr)
return false;
index = array_index_nospec(index, data->nr);
node = data->nodes[index];
if (!node)
return false;
io_put_rsrc_node(ctx, node);

View File

@@ -580,7 +580,7 @@ void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);
req->cqe.flags |= io_put_kbuf(req, max(req->cqe.res, 0), NULL);
io_req_rw_cleanup(req, 0);
io_req_task_complete(tw_req, tw);
@@ -1379,7 +1379,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
list_del(&req->iopoll_node);
wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs);
nr_events++;
req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
req->cqe.flags = io_put_kbuf(req, max(req->cqe.res, 0), NULL);
if (!io_is_uring_cmd(req))
io_req_rw_cleanup(req, 0);
}

View File

@@ -146,9 +146,13 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
if (IS_ERR(tctx))
return PTR_ERR(tctx);
if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) {
unsigned int limits[2] = { ctx->iowq_limits[0],
ctx->iowq_limits[1], };
if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) {
unsigned int limits[2];
mutex_lock(&ctx->uring_lock);
limits[0] = ctx->iowq_limits[0];
limits[1] = ctx->iowq_limits[1];
mutex_unlock(&ctx->uring_lock);
ret = io_wq_max_workers(tctx->io_wq, limits);
if (ret)
@@ -171,7 +175,10 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
}
if (!current->io_uring) {
err_free:
io_wq_put_and_exit(tctx->io_wq);
if (tctx->io_wq) {
io_wq_exit_start(tctx->io_wq);
io_wq_put_and_exit(tctx->io_wq);
}
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
}

View File

@@ -396,6 +396,7 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
ifq->rq.ring = (struct io_uring *)ptr;
ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring));
return 0;
}
@@ -579,13 +580,13 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
if (ifq->area)
io_zcrx_free_area(ifq, ifq->area);
free_uid(ifq->user);
if (ifq->mm_account)
mmdrop(ifq->mm_account);
if (ifq->dev)
put_device(ifq->dev);
io_free_rbuf_ring(ifq);
free_uid(ifq->user);
mutex_destroy(&ifq->pp_lock);
kfree(ifq);
}
@@ -601,6 +602,8 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov)
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
guard(spinlock_bh)(&area->freelist_lock);
if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs))
return;
area->freelist[area->free_count++] = net_iov_idx(niov);
}