From 7cb3a68376da0bc0afab8157223cb479c97de9ff Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 6 Feb 2026 14:58:04 -0700 Subject: [PATCH 01/19] io_uring: simplify IORING_SETUP_DEFER_TASKRUN && !SQPOLL check io_uring_sanitise_params() already rejects flags that include both IORING_SETUP_SQPOLL and IORING_SETUP_DEFER_TASKRUN. So it's unnecessary to check IORING_SETUP_SQPOLL in io_uring_create() when IORING_SETUP_DEFER_TASKRUN has already been checked. Drop the !(ctx->flags & IORING_SETUP_SQPOLL) check for the task_complete case. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1d5bc669afd9..3a7be1695c39 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2949,8 +2949,7 @@ static __cold int io_uring_create(struct io_ctx_config *config) static_branch_inc(&io_key_has_sqarray); if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && - !(ctx->flags & IORING_SETUP_IOPOLL) && - !(ctx->flags & IORING_SETUP_SQPOLL)) + !(ctx->flags & IORING_SETUP_IOPOLL)) ctx->task_complete = true; if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) From daa0b901f8319414cf9f56237f15240b95e4b1b2 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Tue, 10 Feb 2026 10:34:32 +0800 Subject: [PATCH 02/19] io_uring/tctx: avoid modifying loop variable in io_ring_add_registered_file Use a separate 'idx' variable to store the result of array_index_nospec() instead of modifying the loop variable 'offset' directly. This improves code clarity by separating the logical index from the sanitized index used for array access. No functional change intended. 
Signed-off-by: Yang Xiuwei Signed-off-by: Jens Axboe --- io_uring/tctx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index ad9e4336d736..270263699c6f 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -240,14 +240,14 @@ void io_uring_unreg_ringfd(void) int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end) { - int offset; + int offset, idx; for (offset = start; offset < end; offset++) { - offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[offset]) + idx = array_index_nospec(offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[idx]) continue; - tctx->registered_rings[offset] = file; - return offset; + tctx->registered_rings[idx] = file; + return idx; } return -EBUSY; } From 417d029dc412c1028bce3d4685700332c0539a95 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Feb 2026 18:04:43 +0000 Subject: [PATCH 03/19] io_uring/zcrx: improve types for size calculation Make sure io_import_umem() promotes the type to long before calculating the area size. While the area size is capped at 1GB by io_validate_user_buf_range() and fits into an "int", it's still too error prone. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 3d398283cf34..69567e19b4ca 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -193,7 +193,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, return PTR_ERR(pages); ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, - 0, nr_pages << PAGE_SHIFT, + 0, (unsigned long)nr_pages << PAGE_SHIFT, GFP_KERNEL_ACCOUNT); if (ret) { unpin_user_pages(pages, nr_pages); From 0efc331d78b043b9d8477c64e279058062d36a0b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Feb 2026 14:31:22 +0000 Subject: [PATCH 04/19] io_uring/rsrc: replace reg buffer bit field with flags I'll need a flag in the registered buffer struct for dmabuf work, and it'll be more convenient to have a flags field rather than bit fields, especially for io_mapped_ubuf initialisation. We might want to add more flags in the future as well. For example, it might be useful for debugging and potentially optimisations to split out a flag indicating the shape of the buffer to gate iov_iter_advance() walks vs bit/mask arithmetics. It can also be combined with the direction mask field. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 12 ++++++------ io_uring/rsrc.h | 6 +++++- io_uring/rw.c | 3 ++- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 95ce553fff8d..05f00bdb02d7 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -828,7 +828,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, imu->folio_shift = PAGE_SHIFT; imu->release = io_release_ubuf; imu->priv = imu; - imu->is_kbuf = false; + imu->flags = 0; imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; @@ -985,7 +985,7 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, refcount_set(&imu->refs, 1); imu->release = release; imu->priv = rq; - imu->is_kbuf = true; + imu->flags = IO_REGBUF_F_KBUF; imu->dir = 1 << rq_data_dir(rq); rq_for_each_bvec(bv, rq, rq_iter) @@ -1020,7 +1020,7 @@ int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, ret = -EINVAL; goto unlock; } - if (!node->buf->is_kbuf) { + if (!(node->buf->flags & IO_REGBUF_F_KBUF)) { ret = -EBUSY; goto unlock; } @@ -1076,7 +1076,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, offset = buf_addr - imu->ubuf; - if (imu->is_kbuf) + if (imu->flags & IO_REGBUF_F_KBUF) return io_import_kbuf(ddir, iter, imu, len, offset); /* @@ -1496,7 +1496,7 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, iovec_off = vec->nr - nr_iovs; iov = vec->iovec + iovec_off; - if (imu->is_kbuf) { + if (imu->flags & IO_REGBUF_F_KBUF) { int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); if (unlikely(ret)) @@ -1534,7 +1534,7 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, req->flags |= REQ_F_NEED_CLEANUP; } - if (imu->is_kbuf) + if (imu->flags & IO_REGBUF_F_KBUF) return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h 
index 4a5db2ad1af2..cff0f8834c35 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -28,6 +28,10 @@ enum { IO_IMU_SOURCE = 1 << ITER_SOURCE, }; +enum { + IO_REGBUF_F_KBUF = 1, +}; + struct io_mapped_ubuf { u64 ubuf; unsigned int len; @@ -37,7 +41,7 @@ struct io_mapped_ubuf { unsigned long acct_pages; void (*release)(void *); void *priv; - bool is_kbuf; + u8 flags; u8 dir; struct bio_vec bvec[] __counted_by(nr_bvecs); }; diff --git a/io_uring/rw.c b/io_uring/rw.c index d10386f56d49..b3971171c342 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -702,7 +702,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if ((kiocb->ki_flags & IOCB_NOWAIT) && !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; - if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf) + if ((req->flags & REQ_F_BUF_NODE) && + (req->buf_node->buf->flags & IO_REGBUF_F_KBUF)) return -EFAULT; ppos = io_kiocb_ppos(kiocb); From a6bded921ed35f21b3f6bd8e629bf488499ca442 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Feb 2026 15:12:03 -0700 Subject: [PATCH 05/19] io_uring/filetable: clamp alloc_hint to the configured alloc range Explicit fixed file install/remove operations on slots outside the configured alloc range can corrupt alloc_hint via io_file_bitmap_set() and io_file_bitmap_clear(), which unconditionally update alloc_hint to the bit position. This causes subsequent auto-allocations to fall outside the configured range. For example, if the alloc range is [10, 20) and a file is removed at slot 2, alloc_hint gets set to 2. The next auto-alloc then starts searching from slot 2, potentially returning a slot below the range. Fix this by clamping alloc_hint to [file_alloc_start, file_alloc_end) at the top of io_file_bitmap_get() before starting the search. 
Cc: stable@vger.kernel.org Fixes: 6e73dffbb93c ("io_uring: let to set a range for file slot allocation") Signed-off-by: Jens Axboe --- io_uring/filetable.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 794ef95df293..cb1838c9fc37 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -22,6 +22,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) if (!table->bitmap) return -ENFILE; + if (table->alloc_hint < ctx->file_alloc_start || + table->alloc_hint >= ctx->file_alloc_end) + table->alloc_hint = ctx->file_alloc_start; + do { ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); if (ret != nr) From f4d0668b38d8784f33a9a36c72ed5d0078247538 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Feb 2026 15:12:13 -0700 Subject: [PATCH 06/19] io_uring/openclose: fix io_pipe_fixed() slot tracking for specific slots __io_fixed_fd_install() returns 0 on success for non-alloc mode (specific slot), not the slot index. io_pipe_fixed() used this return value directly as the slot index in fds[], which can cause the reported values returned via copy_to_user() to be incorrect, or the error path operating on the incorrect direct descriptor. Fix by computing the actual 0-based slot index (slot - 1) for specific slot mode, while preserving the existing behavior for auto-alloc mode where __io_fixed_fd_install() already returns the allocated index. 
Cc: stable@vger.kernel.org Fixes: 53db8a71ecb4 ("io_uring: add support for IORING_OP_PIPE") Signed-off-by: Jens Axboe --- io_uring/openclose.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/io_uring/openclose.c b/io_uring/openclose.c index d617b421b1e6..c71242915dad 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -345,31 +345,34 @@ static int io_pipe_fixed(struct io_kiocb *req, struct file **files, { struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); struct io_ring_ctx *ctx = req->ctx; + bool alloc_slot; int ret, fds[2] = { -1, -1 }; int slot = p->file_slot; if (p->flags & O_CLOEXEC) return -EINVAL; + alloc_slot = slot == IORING_FILE_INDEX_ALLOC; + io_ring_submit_lock(ctx, issue_flags); ret = __io_fixed_fd_install(ctx, files[0], slot); if (ret < 0) goto err; - fds[0] = ret; + fds[0] = alloc_slot ? ret : slot - 1; files[0] = NULL; /* * If a specific slot is given, next one will be used for * the write side. */ - if (slot != IORING_FILE_INDEX_ALLOC) + if (!alloc_slot) slot++; ret = __io_fixed_fd_install(ctx, files[1], slot); if (ret < 0) goto err; - fds[1] = ret; + fds[1] = alloc_slot ? ret : slot - 1; files[1] = NULL; io_ring_submit_unlock(ctx, issue_flags); From d7d95207caf41a8b5a595ed257cb4ce69726d3d7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Feb 2026 19:47:38 -0700 Subject: [PATCH 07/19] io_uring: use the right type for creds iteration In io_ring_ctx_wait_and_kill(), struct creds *creds is used to iterate and prune credentials. But the correct type is struct cred. This doesn't matter as the variable isn't used at all, only the index is used. But it's confusing using a type that isn't valid, so fix it up. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3a7be1695c39..c45af82dda3d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2398,7 +2398,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; - struct creds *creds; + struct cred *creds; mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); From a983aae397767e9da931128ff2b5bf9066513ce3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 14 Feb 2026 22:19:32 +0000 Subject: [PATCH 08/19] io_uring/zcrx: fix sgtable leak on mapping failures In an unlikely case when io_populate_area_dma() fails, which could only happen on a PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA machine, io_zcrx_map_area() will have an initialised and not freed table. It was supposed to be cleaned up in the error path, but !is_mapped prevents that. 
Fixes: 439a98b972fbb ("io_uring/zcrx: deduplicate area mapping") Cc: stable@vger.kernel.org Reported-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 69567e19b4ca..006e1bfefa5f 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -288,6 +288,9 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) } ret = io_populate_area_dma(ifq, area); + if (ret && !area->mem.is_dmabuf) + dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); if (ret == 0) area->is_mapped = true; return ret; From 5d540e4508950c674d6feef1d95463d039bbf4f5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 14 Feb 2026 22:20:47 +0000 Subject: [PATCH 09/19] io_uring/zcrx: fix post open error handling Closing a queue doesn't guarantee that all associated page pools are terminated right away, let the refcounting do the work instead of releasing the zcrx ctx directly. 
Cc: stable@vger.kernel.org Fixes: e0793de24a9f6 ("io_uring/zcrx: set pp memory provider for an rx queue") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 006e1bfefa5f..b24d1da2e1ca 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -515,9 +515,6 @@ static void io_close_queue(struct io_zcrx_ifq *ifq) .mp_priv = ifq, }; - if (ifq->if_rxq == -1) - return; - scoped_guard(mutex, &ifq->pp_lock) { netdev = ifq->netdev; netdev_tracker = ifq->netdev_tracker; @@ -525,7 +522,8 @@ static void io_close_queue(struct io_zcrx_ifq *ifq) } if (netdev) { - net_mp_close_rxq(netdev, ifq->if_rxq, &p); + if (ifq->if_rxq != -1) + net_mp_close_rxq(netdev, ifq->if_rxq, &p); netdev_put(netdev, &netdev_tracker); } ifq->if_rxq = -1; @@ -833,13 +831,12 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, } return 0; netdev_put_unlock: - netdev_put(ifq->netdev, &ifq->netdev_tracker); netdev_unlock(ifq->netdev); err: scoped_guard(mutex, &ctx->mmap_lock) xa_erase(&ctx->zcrx_ctxs, id); ifq_free: - io_zcrx_ifq_free(ifq); + zcrx_unregister(ifq); return ret; } From 7496e658a76a61758b20e27cea8abcfeafe3aec4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Feb 2026 21:29:12 +0000 Subject: [PATCH 10/19] io_uring/zcrx: check unsupported flags on import The imported zcrx registration path checks for ZCRX_REG_IMPORT, as it should, but doesn't reject any unsupported flags. Fix that. 
Cc: stable@vger.kernel.org Fixes: 00d91481279fb ("io_uring/zcrx: share an ifq between rings") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b24d1da2e1ca..dda863e5e8b7 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -677,6 +677,8 @@ static int import_zcrx(struct io_ring_ctx *ctx, return -EINVAL; if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr) return -EINVAL; + if (reg->flags & ~ZCRX_REG_IMPORT) + return -EINVAL; fd = reg->if_idx; CLASS(fd, f)(fd); From c29214677a9fc1a3a4ee65e189afeb5fd10d676f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Feb 2026 21:34:28 +0000 Subject: [PATCH 11/19] io_uring/query: return support for custom rx page size Add an ability to query if the zcrx rx page size setting is available. Note, even when the API is supported by io_uring, the registration can still get rejected for various reasons, e.g. when the NIC or the driver doesn't support it, when the particular specified size is unsupported, when the memory area doesn't satisfy all requirements, etc. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 8 ++++++++ include/uapi/linux/io_uring/query.h | 3 ++- io_uring/query.c | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index da5156954731..c462bdf3c42c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1090,6 +1090,14 @@ enum zcrx_reg_flags { ZCRX_REG_IMPORT = 1, }; +enum zcrx_features { + /* + * The user can ask for the desired rx page size by passing the + * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. 
+ */ + ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, +}; + /* * Argument for IORING_REGISTER_ZCRX_IFQ */ diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index 2456e6c5ebb5..0b6248175e26 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -50,7 +50,8 @@ struct io_uring_query_zcrx { __u64 area_flags; /* The number of supported ZCRX_CTRL_* opcodes */ __u32 nr_ctrl_opcodes; - __u32 __resv1; + /* Bitmask of ZCRX_FEATURE_* indicating which features are available */ + __u32 features; /* The refill ring header size */ __u32 rq_hdr_size; /* The alignment for the header */ diff --git a/io_uring/query.c b/io_uring/query.c index abdd6f3e1223..63cc30c9803d 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -39,7 +39,7 @@ static ssize_t io_query_zcrx(union io_query_data *data) e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST; e->rq_hdr_size = sizeof(struct io_uring); e->rq_hdr_alignment = L1_CACHE_BYTES; - e->__resv1 = 0; + e->features = ZCRX_FEATURE_RX_PAGE_SIZE; e->__resv2 = 0; return sizeof(*e); } From 6b34f8edf8b807b7f87901623aa52dfa1b29ef93 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Feb 2026 21:38:09 +0000 Subject: [PATCH 12/19] io_uring/query: add query.h copyright notice Add a copyright notice to io_uring's query uapi header. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/query.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index 0b6248175e26..95500759cc13 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ /* * Header file for the io_uring query interface. + * + * Copyright (C) 2026 Pavel Begunkov + * Copyright (C) Meta Platforms, Inc. 
*/ #ifndef LINUX_IO_URING_QUERY_H #define LINUX_IO_URING_QUERY_H From 56112578c71213a10c995a56835bddb5e9ab1ed0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Feb 2026 22:06:52 +0000 Subject: [PATCH 13/19] io_uring: delay sqarray static branch disablement io_key_has_sqarray static branch can be easily switched on/off by the user every time patching the kernel. That can be very disruptive as it might require heavy synchronisation across all CPUs. Use deferred static keys, which can rate-limit it by deferring, batching and potentially effectively eliminating dec+inc pairs. Fixes: 9b296c625ac1d ("io_uring: static_key for !IORING_SETUP_NO_SQARRAY") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c45af82dda3d..ccab8562d273 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -119,7 +119,7 @@ static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); static void __io_req_caches_free(struct io_ring_ctx *ctx); -static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); +static __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(io_key_has_sqarray, HZ); struct kmem_cache *req_cachep; static struct workqueue_struct *iou_wq __ro_after_init; @@ -1978,7 +1978,7 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) unsigned mask = ctx->sq_entries - 1; unsigned head = ctx->cached_sq_head++ & mask; - if (static_branch_unlikely(&io_key_has_sqarray) && + if (static_branch_unlikely(&io_key_has_sqarray.key) && (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { @@ -2173,7 +2173,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_rings_free(ctx); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - static_branch_dec(&io_key_has_sqarray); + 
static_branch_slow_dec_deferred(&io_key_has_sqarray); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); @@ -2946,7 +2946,7 @@ static __cold int io_uring_create(struct io_ctx_config *config) ctx->clock_offset = 0; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - static_branch_inc(&io_key_has_sqarray); + static_branch_deferred_inc(&io_key_has_sqarray); if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL)) From 600b665b903733bd60334e86031b157cc823ee55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Mon, 16 Feb 2026 10:27:18 +0000 Subject: [PATCH 14/19] io_uring/cmd_net: fix too strict requirement on ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Attempting SOCKET_URING_OP_SETSOCKOPT on an AF_NETLINK socket resulted in an -EOPNOTSUPP, as AF_NETLINK doesn't have an ioctl in its struct proto, but only in struct proto_ops. Prior to the blamed commit, io_uring_cmd_sock() only had two cmd_op operations, both requiring ioctl, thus the check was warranted. Since then, 4 new cmd_op operations have been added, none of which depend on ioctl. This patch moves the ioctl check, so it only applies to the original operations. AFAICT, the ioctl requirement was unintentional, and it wasn't visible in the blamed patch within 3 lines of context. 
Cc: stable@vger.kernel.org Fixes: a5d2f99aff6b ("io_uring/cmd: Introduce SOCKET_URING_OP_GETSOCKOPT") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/cmd_net.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index cb2775936fb8..57ddaf874611 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -160,16 +160,19 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) struct proto *prot = READ_ONCE(sk->sk_prot); int ret, arg = 0; - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; - switch (cmd->cmd_op) { case SOCKET_URING_OP_SIOCINQ: + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + ret = prot->ioctl(sk, SIOCINQ, &arg); if (ret) return ret; return arg; case SOCKET_URING_OP_SIOCOUTQ: + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + ret = prot->ioctl(sk, SIOCOUTQ, &arg); if (ret) return ret; From 046fcc83ac1ba8747f0bcae13f5e433802735245 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Sun, 15 Feb 2026 23:15:23 +0000 Subject: [PATCH 15/19] io_uring: remove unneeded io_send_zc accounting zc->len and zc->buf are not actually used once you get to the retry stage. The buffer remains in kmsg->msg.msg_iter, which is setup in io_send_setup. Note: it still seems needed in io_send due to io_send_select_buffer needing it (for the len parameter). 
Signed-off-by: Dylan Yudaken Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/net.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index a6f3cbb7dfea..8576c6cb2236 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1493,8 +1493,6 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { - zc->len -= ret; - zc->buf += ret; zc->done_io += ret; return -EAGAIN; } From 2e02f9efdbc6c73544e315b7eb85e55a59776b6f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Feb 2026 13:55:30 +0000 Subject: [PATCH 16/19] io_uring/rsrc: improve regbuf iov validation Deduplicate io_buffer_validate() calls by moving the checks into io_sqe_buffer_register(). Now we also don't need special handling in io_buffer_validate() passing through buffer removal requests. I also was using it as a cleanup before some other changes. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 05f00bdb02d7..842e231c8a7c 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -96,20 +96,6 @@ int io_validate_user_buf_range(u64 uaddr, u64 ulen) return 0; } -static int io_buffer_validate(struct iovec *iov) -{ - /* - * Don't impose further limits on the size and buffer - * constraints here, we'll -EINVAL later when IO is - * submitted if they are wrong. - */ - if (!iov->iov_base) - return iov->iov_len ? 
-EFAULT : 0; - - return io_validate_user_buf_range((unsigned long)iov->iov_base, - iov->iov_len); -} - static void io_release_ubuf(void *priv) { struct io_mapped_ubuf *imu = priv; @@ -319,9 +305,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = -EFAULT; break; } - err = io_buffer_validate(iov); - if (err) - break; node = io_sqe_buffer_register(ctx, iov, &last_hpage); if (IS_ERR(node)) { err = PTR_ERR(node); @@ -790,8 +773,17 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, struct io_imu_folio_data data; bool coalesced = false; - if (!iov->iov_base) + if (!iov->iov_base) { + if (iov->iov_len) + return ERR_PTR(-EFAULT); + /* remove the buffer without installing a new one */ return NULL; + } + + ret = io_validate_user_buf_range((unsigned long)iov->iov_base, + iov->iov_len); + if (ret) + return ERR_PTR(ret); node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) @@ -897,9 +889,6 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ret = PTR_ERR(iov); break; } - ret = io_buffer_validate(iov); - if (ret) - break; if (ctx->compat) arg += sizeof(struct compat_iovec); else From 22dbb0987bd1e0ec3b1e4ad20756a98f99aa4a08 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 16 Feb 2026 14:16:27 -0700 Subject: [PATCH 17/19] io_uring/cancel: de-unionize file and user_data in struct io_cancel_data By having them share the same space in struct io_cancel_data, it ends up disallowing IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_USERDATA from working. Eg you cannot match on both a file and user_data for cancelation purposes. This obviously isn't a common use case as nobody has reported this, but it does result in -ENOENT potentially being returned when trying to match on both, rather than actually doing what the API says it would. 
Fixes: 4bf94615b888 ("io_uring: allow IORING_OP_ASYNC_CANCEL with 'fd' key") Signed-off-by: Jens Axboe --- io_uring/cancel.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 6783961ede1b..1b201a094303 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -6,10 +6,8 @@ struct io_cancel_data { struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; + u64 data; + struct file *file; u8 opcode; u32 flags; int seq; From d21c362182aff7b4d994e35ceb04b6ed2da141d9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Feb 2026 09:29:59 -0700 Subject: [PATCH 18/19] io_uring/bpf_filter: move filter size and populate helper into struct Rather than open-code this logic in io_uring_populate_bpf_ctx() with a switch, move it to the issue side definitions. Outside of making this easier to extend in the future, it's also a prep patch for using the pdu size for a given opcode filter elsewhere. Signed-off-by: Jens Axboe --- io_uring/bpf_filter.c | 17 ++++++----------- io_uring/opdef.c | 6 ++++++ io_uring/opdef.h | 6 ++++++ 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c index 3816883a45ed..8ac7d06de122 100644 --- a/io_uring/bpf_filter.c +++ b/io_uring/bpf_filter.c @@ -26,6 +26,8 @@ static const struct io_bpf_filter dummy_filter; static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req) { + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + bctx->opcode = req->opcode; bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS; bctx->user_data = req->cqe.user_data; @@ -34,19 +36,12 @@ static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx, sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size)); /* - * Opcodes can provide a handler fo populating more data into bctx, + * Opcodes can provide a handler for populating more data into bctx, * for filters to use. 
*/ - switch (req->opcode) { - case IORING_OP_SOCKET: - bctx->pdu_size = sizeof(bctx->socket); - io_socket_bpf_populate(bctx, req); - break; - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - bctx->pdu_size = sizeof(bctx->open); - io_openat_bpf_populate(bctx, req); - break; + if (def->filter_pdu_size) { + bctx->pdu_size = def->filter_pdu_size; + def->filter_populate(bctx, req); } } diff --git a/io_uring/opdef.c b/io_uring/opdef.c index df52d760240e..91a23baf415e 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -221,8 +221,10 @@ const struct io_issue_def io_issue_defs[] = { .issue = io_fallocate, }, [IORING_OP_OPENAT] = { + .filter_pdu_size = sizeof_field(struct io_uring_bpf_ctx, open), .prep = io_openat_prep, .issue = io_openat, + .filter_populate = io_openat_bpf_populate, }, [IORING_OP_CLOSE] = { .prep = io_close_prep, @@ -309,8 +311,10 @@ const struct io_issue_def io_issue_defs[] = { #endif }, [IORING_OP_OPENAT2] = { + .filter_pdu_size = sizeof_field(struct io_uring_bpf_ctx, open), .prep = io_openat2_prep, .issue = io_openat2, + .filter_populate = io_openat_bpf_populate, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -406,8 +410,10 @@ const struct io_issue_def io_issue_defs[] = { [IORING_OP_SOCKET] = { .audit_skip = 1, #if defined(CONFIG_NET) + .filter_pdu_size = sizeof_field(struct io_uring_bpf_ctx, socket), .prep = io_socket_prep, .issue = io_socket, + .filter_populate = io_socket_bpf_populate, #else .prep = io_eopnotsupp_prep, #endif diff --git a/io_uring/opdef.h b/io_uring/opdef.h index aa37846880ff..faf3955dce8b 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -2,6 +2,8 @@ #ifndef IOU_OP_DEF_H #define IOU_OP_DEF_H +struct io_uring_bpf_ctx; + struct io_issue_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -33,8 +35,12 @@ struct io_issue_def { /* size of async data needed, if any */ unsigned short async_size; + /* bpf filter pdu size, if any */ + unsigned short filter_pdu_size; + int (*issue)(struct io_kiocb *, 
unsigned int); int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); + void (*filter_populate)(struct io_uring_bpf_ctx *, struct io_kiocb *); }; struct io_cold_def { From be3573124e630736d2d39650b12f5ef220b47ac1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Feb 2026 10:00:44 -0700 Subject: [PATCH 19/19] io_uring/bpf_filter: pass in expected filter payload size It's quite possible that for opcodes that have payloads attached to them, like IORING_OP_OPENAT/OPENAT2 or IORING_OP_SOCKET, these payloads can change over time. For example, on the openat/openat2 side, the struct open_how argument is extensible, and could be extended in the future to allow further arguments to be passed in. Allow registration of a cBPF filter to give the size of the filter as seen by userspace. If that filter is for an opcode that takes extra payload data, allow it if the application payload expectation is the same size as the kernel's. If that is the case, the kernel supports filtering on the payload that the application expects. If the size differs, the behavior depends on the IO_URING_BPF_FILTER_SZ_STRICT flag: 1) If IO_URING_BPF_FILTER_SZ_STRICT is set and the size expectation differs, fail the attempt to load the filter. 2) If IO_URING_BPF_FILTER_SZ_STRICT isn't set, allow the filter if the userspace pdu size is smaller than what the kernel offers. 3) Regardless of IO_URING_BPF_FILTER_SZ_STRICT, fail loading the filter if the userspace pdu size is bigger than what the kernel supports. An attempt to load a filter that fails due to sizing will error with -EMSGSIZE. For that error, the registration struct will have filter->pdu_size populated with the pdu size that the kernel uses. 
Reported-by: Christian Brauner Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/bpf_filter.h | 8 ++- io_uring/bpf_filter.c | 69 +++++++++++++++++------- 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h index 220351b81bc0..1b461d792a7b 100644 --- a/include/uapi/linux/io_uring/bpf_filter.h +++ b/include/uapi/linux/io_uring/bpf_filter.h @@ -35,13 +35,19 @@ enum { * If set, any currently unset opcode will have a deny filter attached */ IO_URING_BPF_FILTER_DENY_REST = 1, + /* + * If set, if kernel and application don't agree on pdu_size for + * the given opcode, fail the registration of the filter. + */ + IO_URING_BPF_FILTER_SZ_STRICT = 2, }; struct io_uring_bpf_filter { __u32 opcode; /* io_uring opcode to filter */ __u32 flags; __u32 filter_len; /* number of BPF instructions */ - __u32 resv; + __u8 pdu_size; /* expected pdu size for opcode */ + __u8 resv[3]; __u64 filter_ptr; /* pointer to BPF filter */ __u64 resv2[5]; }; diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c index 8ac7d06de122..28a23e92ee81 100644 --- a/io_uring/bpf_filter.c +++ b/io_uring/bpf_filter.c @@ -308,7 +308,54 @@ static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src) return ERR_PTR(-EBUSY); } -#define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST +#define IO_URING_BPF_FILTER_FLAGS (IO_URING_BPF_FILTER_DENY_REST | \ + IO_URING_BPF_FILTER_SZ_STRICT) + +static int io_bpf_filter_import(struct io_uring_bpf *reg, + struct io_uring_bpf __user *arg) +{ + const struct io_issue_def *def; + int ret; + + if (copy_from_user(reg, arg, sizeof(*reg))) + return -EFAULT; + if (reg->cmd_type != IO_URING_BPF_CMD_FILTER) + return -EINVAL; + if (reg->cmd_flags || reg->resv) + return -EINVAL; + + if (reg->filter.opcode >= IORING_OP_LAST) + return -EINVAL; + if (reg->filter.flags & ~IO_URING_BPF_FILTER_FLAGS) + return -EINVAL; + if (!mem_is_zero(reg->filter.resv, 
sizeof(reg->filter.resv))) + return -EINVAL; + if (!mem_is_zero(reg->filter.resv2, sizeof(reg->filter.resv2))) + return -EINVAL; + if (!reg->filter.filter_len || reg->filter.filter_len > BPF_MAXINSNS) + return -EINVAL; + + /* Verify filter size */ + def = &io_issue_defs[array_index_nospec(reg->filter.opcode, IORING_OP_LAST)]; + + /* same size, always ok */ + ret = 0; + if (reg->filter.pdu_size == def->filter_pdu_size) + ; + /* size differs, fail in strict mode */ + else if (reg->filter.flags & IO_URING_BPF_FILTER_SZ_STRICT) + ret = -EMSGSIZE; + /* userspace filter is bigger, always disallow */ + else if (reg->filter.pdu_size > def->filter_pdu_size) + ret = -EMSGSIZE; + + /* copy back kernel filter size */ + reg->filter.pdu_size = def->filter_pdu_size; + if (copy_to_user(&arg->filter, &reg->filter, sizeof(reg->filter))) + return -EFAULT; + + return ret; +} int io_register_bpf_filter(struct io_restriction *res, struct io_uring_bpf __user *arg) @@ -320,23 +367,9 @@ int io_register_bpf_filter(struct io_restriction *res, struct sock_fprog fprog; int ret; - if (copy_from_user(&reg, arg, sizeof(reg))) - return -EFAULT; - if (reg.cmd_type != IO_URING_BPF_CMD_FILTER) - return -EINVAL; - if (reg.cmd_flags || reg.resv) - return -EINVAL; - - if (reg.filter.opcode >= IORING_OP_LAST) - return -EINVAL; - if (reg.filter.flags & ~IO_URING_BPF_FILTER_FLAGS) - return -EINVAL; - if (reg.filter.resv) - return -EINVAL; - if (!mem_is_zero(reg.filter.resv2, sizeof(reg.filter.resv2))) - return -EINVAL; - if (!reg.filter.filter_len || reg.filter.filter_len > BPF_MAXINSNS) - return -EINVAL; + ret = io_bpf_filter_import(&reg, arg); + if (ret) + return ret; fprog.len = reg.filter.filter_len; fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);