Merge tag 'io_uring-7.1-20260424' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring fixes from Jens Axboe:
- Fix for a NOMMU bug with io_uring, where NOMMU doesn't grab page refs
at mmap time. NOMMU also has entirely broken FOLL_PIN support, yet
here we are
- A few fixes covering minor issues introduced in this merge window
- Data race annotation to shut up KCSAN for when io-wq limits are
applied
- A nospec addition for direct descriptor file updating. Rest of the
direct descriptor path already had this, but for some reason the
update did not. Now they are all the same
- Various minor defensive changes that Claude identified and suggested
terrible fixes for, turned into actually useful cleanups:
- Use kvfree() for the imu cache. These can come from kmalloc or
vmalloc depending on size, but the in-cache ones are capped at a
size where it's always kmalloc based. Change to kvfree() in the
cleanup path, making future changes unlikely to mess that up
- Negative kbuf consumption lengths. Can't happen right now, but
cqe->res is used directly, which, if other code changes, could
then be an error value
- Fix for an issue with the futex code, where partial wakes on a
vectored futex would potentially wake the same futex twice, rather
than move on to the next one. This could confuse an application as it
would've expected the next futex to have been woken
- Fix for a bug with ring resizing, where SQEs or CQEs might not have
been copied correctly if large SQEs or CQEs are used in the ring.
Application side issue, where SQEs or CQEs might have been lost
during resize
- Fix for a bug where EPOLL_URING_WAKE might have been lost, causing a
multishot poll to not be terminated when it's nested, like it should
have been
- Fix for an issue with signed comparison of poll references for the
slow path
- Fix for a user struct UAF in the zcrx code
- Two minor zcrx cleanups
* tag 'io_uring-7.1-20260424' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
io_uring: take page references for NOMMU pbuf_ring mmaps
io_uring/poll: ensure EPOLL_ONESHOT is propagated for EPOLL_URING_WAKE
io_uring/zcrx: warn on freelist violations
io_uring/zcrx: clear RQ headers on init
io_uring/zcrx: fix user_struct uaf
io_uring/register: fix ring resizing with mixed/large SQEs/CQEs
io_uring/futex: ensure partial wakes are appropriately dequeued
io_uring/rw: add defensive hardening for negative kbuf lengths
io_uring/rsrc: use kvfree() for the imu cache
io_uring/rsrc: unify nospec indexing for direct descriptors
io_uring: fix spurious fput in registered ring path
io_uring: fix iowq_limits data race in tctx node addition
io_uring/tctx: mark io_wq as exiting before error path teardown
io_uring/tctx: check for setup tctx->io_wq before teardown
io_uring/poll: fix signed comparison in io_poll_get_ownership()
@@ -64,7 +64,7 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp)
 static inline void io_cache_free(struct io_alloc_cache *cache, void *obj)
 {
         if (!io_alloc_cache_put(cache, obj))
-                kfree(obj);
+                kvfree(obj);
 }
 
 #endif
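The change above matters because imu allocations can come from either kmalloc() or vmalloc(), and only kvfree() is safe for both. As a rough userspace analogue of that idea (every name below is illustrative, not kernel code): small buffers come from malloc(), large ones from mmap(), and a single free routine works regardless of which path allocated the buffer.

#include <stdlib.h>
#include <sys/mman.h>

/* Hypothetical kvmalloc()/kvfree()-style pair: a header in front of the
 * payload records which allocator was used. */
#define LARGE_THRESHOLD (64 * 1024)

struct hdr { size_t size; int mapped; };

void *xv_alloc(size_t size)
{
        size_t total = sizeof(struct hdr) + size;
        struct hdr *h;

        if (total >= LARGE_THRESHOLD) {
                h = mmap(NULL, total, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                if (h == MAP_FAILED)
                        return NULL;
                h->mapped = 1;
        } else {
                h = malloc(total);
                if (!h)
                        return NULL;
                h->mapped = 0;
        }
        h->size = total;
        return h + 1;
}

/* Correct no matter which path allocated the buffer -- the point of kvfree(). */
void xv_free(void *p)
{
        struct hdr *h;

        if (!p)
                return;
        h = (struct hdr *)p - 1;
        if (h->mapped)
                munmap(h, h->size);
        else
                free(h);
}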
@@ -159,8 +159,10 @@ static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q)
         struct io_kiocb *req = q->wake_data;
         struct io_futexv_data *ifd = req->async_data;
 
-        if (!io_futexv_claim(ifd))
+        if (!io_futexv_claim(ifd)) {
+                __futex_wake_mark(q);
                 return;
+        }
         if (unlikely(!__futex_wake_mark(q)))
                 return;
 
@@ -2575,7 +2575,8 @@ struct file *io_uring_ctx_get_file(unsigned int fd, bool registered)
                 return ERR_PTR(-EBADF);
         if (io_is_uring_fops(file))
                 return file;
-        fput(file);
+        if (!registered)
+                fput(file);
         return ERR_PTR(-EOPNOTSUPP);
 }
 
@@ -366,9 +366,53 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 
 #else /* !CONFIG_MMU */
 
+/*
+ * Drop the pages that were initially referenced and added in
+ * io_uring_mmap(). We cannot have had a mremap() as that isn't supported,
+ * hence the vma should be identical to the one we initially referenced and
+ * mapped, and partial unmaps and splitting isn't possible on a file backed
+ * mapping.
+ */
+static void io_uring_nommu_vm_close(struct vm_area_struct *vma)
+{
+        unsigned long index;
+
+        for (index = vma->vm_start; index < vma->vm_end; index += PAGE_SIZE)
+                put_page(virt_to_page((void *) index));
+}
+
+static const struct vm_operations_struct io_uring_nommu_vm_ops = {
+        .close = io_uring_nommu_vm_close,
+};
+
 int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
-        return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
+        struct io_ring_ctx *ctx = file->private_data;
+        struct io_mapped_region *region;
+        unsigned long i;
+
+        if (!is_nommu_shared_mapping(vma->vm_flags))
+                return -EINVAL;
+
+        guard(mutex)(&ctx->mmap_lock);
+        region = io_mmap_get_region(ctx, vma->vm_pgoff);
+        if (!region || !io_region_is_set(region))
+                return -EINVAL;
+
+        if ((vma->vm_end - vma->vm_start) !=
+            (unsigned long) region->nr_pages << PAGE_SHIFT)
+                return -EINVAL;
+
+        /*
+         * Pin the pages so io_free_region()'s release_pages() does not
+         * drop the last reference while this VMA exists. delete_vma()
+         * in mm/nommu.c calls vma_close() which runs ->close above.
+         */
+        for (i = 0; i < region->nr_pages; i++)
+                get_page(region->pages[i]);
+
+        vma->vm_ops = &io_uring_nommu_vm_ops;
+        return 0;
 }
 
 unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
@@ -93,7 +93,7 @@ static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
  */
 static inline bool io_poll_get_ownership(struct io_kiocb *req)
 {
-        if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
+        if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
                 return io_poll_get_ownership_slowpath(req);
         return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
 }
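The one-character fix above casts the reference count to unsigned before comparing it against the bias. A standalone sketch of the underlying pitfall, with made-up constants rather than the kernel's real IO_POLL_REF_BIAS/poll_refs layout: once the top bit of the counter is set, a signed compare sees a negative value and skips the slow path it should have taken.

#include <stdio.h>

#define REF_BIAS 128    /* illustrative threshold, not the kernel's value */

int main(void)
{
        int refs = (int)0x80000001u;    /* counter with the top bit set */

        /* Signed compare: refs is negative, so ">= 128" is false. */
        printf("signed:   refs >= BIAS -> %d\n", refs >= REF_BIAS);
        /* Unsigned compare: the same bits read as a huge value, so it is true. */
        printf("unsigned: refs >= BIAS -> %d\n",
               (unsigned int)refs >= REF_BIAS);
        return 0;
}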
@@ -417,8 +417,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
          * disable multishot as there is a circular dependency between
          * CQ posting and triggering the event.
          */
-        if (mask & EPOLL_URING_WAKE)
+        if (mask & EPOLL_URING_WAKE) {
                 poll->events |= EPOLLONESHOT;
+                req->apoll_events |= EPOLLONESHOT;
+        }
 
         /* optional, saves extra locking for removal in tw handler */
         if (mask && poll->events & EPOLLONESHOT) {
@@ -599,10 +599,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
         if (tail - old_head > p->sq_entries)
                 goto overflow;
         for (i = old_head; i < tail; i++) {
-                unsigned src_head = i & (ctx->sq_entries - 1);
-                unsigned dst_head = i & (p->sq_entries - 1);
+                unsigned index, dst_mask, src_mask;
+                size_t sq_size;
 
-                n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+                index = i;
+                sq_size = sizeof(struct io_uring_sqe);
+                src_mask = ctx->sq_entries - 1;
+                dst_mask = p->sq_entries - 1;
+                if (ctx->flags & IORING_SETUP_SQE128) {
+                        index <<= 1;
+                        sq_size <<= 1;
+                        src_mask = (ctx->sq_entries << 1) - 1;
+                        dst_mask = (p->sq_entries << 1) - 1;
+                }
+                memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size);
         }
         WRITE_ONCE(n.rings->sq.head, old_head);
         WRITE_ONCE(n.rings->sq.tail, tail);
@@ -619,10 +629,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
                         goto out;
                 }
         for (i = old_head; i < tail; i++) {
-                unsigned src_head = i & (ctx->cq_entries - 1);
-                unsigned dst_head = i & (p->cq_entries - 1);
+                unsigned index, dst_mask, src_mask;
+                size_t cq_size;
 
-                n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+                index = i;
+                cq_size = sizeof(struct io_uring_cqe);
+                src_mask = ctx->cq_entries - 1;
+                dst_mask = p->cq_entries - 1;
+                if (ctx->flags & IORING_SETUP_CQE32) {
+                        index <<= 1;
+                        cq_size <<= 1;
+                        src_mask = (ctx->cq_entries << 1) - 1;
+                        dst_mask = (p->cq_entries << 1) - 1;
+                }
+                memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size);
         }
         WRITE_ONCE(n.rings->cq.head, old_head);
         WRITE_ONCE(n.rings->cq.tail, tail);
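Both resize loops above copy entries by raw array index, doubling the index, wrap masks, and copy size when double-width SQEs/CQEs are enabled, instead of assigning a fixed-size struct and silently dropping the second half of each large entry. A simplified standalone sketch of that indexing scheme (types and names are stand-ins, not the kernel's):

#include <stdint.h>
#include <string.h>

/* Each logical entry is one 16-byte unit normally, or two consecutive units
 * when "wide" entries are enabled; the arrays then hold twice as many units.
 * Entry counts are powers of two, so "idx & (entries - 1)" wraps a position. */
struct unit { uint8_t bytes[16]; };

void copy_entries(struct unit *dst, unsigned dst_entries,
                  const struct unit *src, unsigned src_entries,
                  unsigned head, unsigned tail, int wide)
{
        unsigned i;

        for (i = head; i < tail; i++) {
                unsigned index = i;
                unsigned src_mask = src_entries - 1;
                unsigned dst_mask = dst_entries - 1;
                size_t size = sizeof(struct unit);

                if (wide) {
                        /* A logical entry spans two units, so the unit index,
                         * the wrap masks, and the copy size double with it. */
                        index <<= 1;
                        size <<= 1;
                        src_mask = (src_entries << 1) - 1;
                        dst_mask = (dst_entries << 1) - 1;
                }
                memcpy(&dst[index & dst_mask], &src[index & src_mask], size);
        }
}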
@@ -168,7 +168,7 @@ bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
 void io_rsrc_cache_free(struct io_ring_ctx *ctx)
 {
         io_alloc_cache_free(&ctx->node_cache, kfree);
-        io_alloc_cache_free(&ctx->imu_cache, kfree);
+        io_alloc_cache_free(&ctx->imu_cache, kvfree);
 }
 
 static void io_clear_table_tags(struct io_rsrc_data *data)
@@ -238,6 +238,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                         continue;
 
                 i = up->offset + done;
+                if (i >= ctx->file_table.data.nr)
+                        break;
+                i = array_index_nospec(i, ctx->file_table.data.nr);
                 if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
                         io_file_bitmap_clear(&ctx->file_table, i);
 
@@ -109,10 +109,15 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node
 }
 
 static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
-                                      struct io_rsrc_data *data, int index)
+                                      struct io_rsrc_data *data,
+                                      unsigned int index)
 {
-        struct io_rsrc_node *node = data->nodes[index];
+        struct io_rsrc_node *node;
 
+        if (index >= data->nr)
+                return false;
+        index = array_index_nospec(index, data->nr);
+        node = data->nodes[index];
         if (!node)
                 return false;
         io_put_rsrc_node(ctx, node);
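The helper above now bounds-checks the index and passes it through array_index_nospec() before using it. A sketch of the same check-then-sanitize pattern; index_nospec() here is a stand-in written with a branch for readability, whereas the real kernel helper clamps branchlessly so a mispredicted bounds check cannot speculatively index past the table:

#include <stddef.h>

/* Stand-in for array_index_nospec(): out-of-range indexes collapse to 0. */
#define index_nospec(idx, size) ((idx) < (size) ? (idx) : 0)

struct table {
        unsigned int nr;
        void **slots;
};

/* Same shape as the update path above: check the bound first, then sanitize
 * the index before it is ever used to address the array. */
void *table_lookup(struct table *t, unsigned int index)
{
        if (index >= t->nr)
                return NULL;
        index = index_nospec(index, t->nr);
        return t->slots[index];
}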
@@ -580,7 +580,7 @@ void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
         io_req_io_end(req);
 
         if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
-                req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);
+                req->cqe.flags |= io_put_kbuf(req, max(req->cqe.res, 0), NULL);
 
         io_req_rw_cleanup(req, 0);
         io_req_task_complete(tw_req, tw);
@@ -1379,7 +1379,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
                 list_del(&req->iopoll_node);
                 wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs);
                 nr_events++;
-                req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
+                req->cqe.flags = io_put_kbuf(req, max(req->cqe.res, 0), NULL);
                 if (!io_is_uring_cmd(req))
                         io_req_rw_cleanup(req, 0);
         }
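Both call sites above clamp cqe.res to at least zero before handing it to the buffer-accounting path. A small sketch (hypothetical names) of why an unclamped negative result is dangerous once it is treated as an unsigned consumed length:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical accounting: "len" bytes of a selected buffer were consumed.
 * A negative error code passed through unclamped converts to a huge size_t
 * and makes the buffer look exhausted. */
size_t consume(size_t remaining, int len)
{
        size_t used = (size_t)len;      /* wraps if len is negative */

        return used > remaining ? 0 : remaining - used;
}

int main(void)
{
        int res = -5;                   /* stand-in for an error result */

        printf("unclamped: %zu bytes left\n", consume(4096, res));
        printf("clamped:   %zu bytes left\n",
               consume(4096, res > 0 ? res : 0));
        return 0;
}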
@@ -146,9 +146,13 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
         if (IS_ERR(tctx))
                 return PTR_ERR(tctx);
 
-        if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) {
-                unsigned int limits[2] = { ctx->iowq_limits[0],
-                                           ctx->iowq_limits[1], };
+        if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) {
+                unsigned int limits[2];
+
+                mutex_lock(&ctx->uring_lock);
+                limits[0] = ctx->iowq_limits[0];
+                limits[1] = ctx->iowq_limits[1];
+                mutex_unlock(&ctx->uring_lock);
 
                 ret = io_wq_max_workers(tctx->io_wq, limits);
                 if (ret)
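The hunk above snapshots the two io-wq limits under the ring lock and annotates the lockless flag test with data_race(), instead of reading fields a concurrent registration could be updating. A userspace sketch of the same snapshot-under-lock pattern, using pthreads and invented structure names:

#include <pthread.h>
#include <string.h>

/* Hypothetical shared config: a writer may change the limits at any time,
 * so readers hold the lock only long enough to copy them into locals and
 * then operate on the stable snapshot. */
struct shared_cfg {
        pthread_mutex_t lock;
        unsigned int limits[2];
};

void apply_limits(struct shared_cfg *cfg,
                  void (*apply)(const unsigned int *limits))
{
        unsigned int snapshot[2];

        pthread_mutex_lock(&cfg->lock);
        memcpy(snapshot, cfg->limits, sizeof(snapshot));
        pthread_mutex_unlock(&cfg->lock);

        /* The copy cannot change under us, unlike cfg->limits itself. */
        apply(snapshot);
}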
@@ -171,7 +175,10 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
         }
         if (!current->io_uring) {
 err_free:
-                io_wq_put_and_exit(tctx->io_wq);
+                if (tctx->io_wq) {
+                        io_wq_exit_start(tctx->io_wq);
+                        io_wq_put_and_exit(tctx->io_wq);
+                }
                 percpu_counter_destroy(&tctx->inflight);
                 kfree(tctx);
         }
@@ -396,6 +396,7 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
         ifq->rq.ring = (struct io_uring *)ptr;
         ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
 
+        memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring));
         return 0;
 }
 
@@ -579,13 +580,13 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 
         if (ifq->area)
                 io_zcrx_free_area(ifq, ifq->area);
-        free_uid(ifq->user);
         if (ifq->mm_account)
                 mmdrop(ifq->mm_account);
         if (ifq->dev)
                 put_device(ifq->dev);
 
         io_free_rbuf_ring(ifq);
+        free_uid(ifq->user);
         mutex_destroy(&ifq->pp_lock);
         kfree(ifq);
 }
@@ -601,6 +602,8 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov)
         struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
 
         guard(spinlock_bh)(&area->freelist_lock);
+        if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs))
+                return;
         area->freelist[area->free_count++] = net_iov_idx(niov);
 }
 