From bdb489adca295a14750c7343ddb035830fc033b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Mon, 16 Feb 2026 16:03:53 +0000 Subject: [PATCH 01/58] io_uring/cmd_net: split ioctl code out of io_uring_cmd_sock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit io_uring_cmd_sock() originally supported two ioctl-based cmd_op operations. Over time, additional operations were added with tail calls to their helpers. This approach resulted in the new operations sharing an ioctl check with the original operations. io_uring_cmd_sock() now supports 6 operations, so let's move the implementation of the original two into their own helper, reducing io_uring_cmd_sock() to a simple dispatcher. Signed-off-by: Asbjørn Sloth Tønnesen Signed-off-by: Jens Axboe --- io_uring/cmd_net.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 125a81c520a6..7cd411fc4f33 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -7,6 +7,21 @@ #include "uring_cmd.h" #include "io_uring.h" +static int io_uring_cmd_get_sock_ioctl(struct socket *sock, int op) +{ + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + ret = prot->ioctl(sk, op, &arg); + if (ret) + return ret; + return arg; +} + static inline int io_uring_cmd_getsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) @@ -156,27 +171,12 @@ static int io_uring_cmd_getsockname(struct socket *sock, int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct socket *sock = cmd->file->private_data; - struct sock *sk = sock->sk; - struct proto *prot = READ_ONCE(sk->sk_prot); - int ret, arg = 0; switch (cmd->cmd_op) { case SOCKET_URING_OP_SIOCINQ: - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; - - ret = prot->ioctl(sk, SIOCINQ, &arg); - if (ret) - return ret; - return arg; + return io_uring_cmd_get_sock_ioctl(sock, SIOCINQ); case SOCKET_URING_OP_SIOCOUTQ: - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; - - ret = prot->ioctl(sk, SIOCOUTQ, &arg); - if (ret) - return ret; - return arg; + return io_uring_cmd_get_sock_ioctl(sock, SIOCOUTQ); case SOCKET_URING_OP_GETSOCKOPT: return io_uring_cmd_getsockopt(sock, cmd, issue_flags); case SOCKET_URING_OP_SETSOCKOPT: From c279fcd95ae136c9dccccc8b7f5069f651449e58 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Feb 2026 11:45:53 +0000 Subject: [PATCH 02/58] io_uring/zctx: rename flags var for more clarity The name "flags" is too overloaded, so rename the variable in io_sendmsg_zc() into msg_flags to stress that it contains MSG_*. 
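As background for the socket command dispatcher reworked in the first patch above, here is a minimal userspace sketch of issuing SOCKET_URING_OP_SIOCINQ through IORING_OP_URING_CMD. It assumes liburing's io_uring_prep_cmd_sock() helper and an already-initialised ring; error handling is trimmed.

#include <liburing.h>

static int socket_bytes_unread(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	/* level/optname/optval/optlen are unused for the SIOCINQ op */
	io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_SIOCINQ, sockfd, 0, 0, NULL, 0);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* negative errno, or the unread byte count */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}

A non-negative cqe->res is the answer itself, which matches the new io_uring_cmd_get_sock_ioctl() helper returning the ioctl's output value directly.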
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/net.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index d27adbe3f20b..07f002c1d7df 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1526,7 +1526,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; - unsigned flags; + unsigned msg_flags; int ret, min_ret = 0; if (req->flags & REQ_F_IMPORT_BUFFER) { @@ -1552,21 +1552,21 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - flags = sr->msg_flags; + msg_flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) + msg_flags |= MSG_DONTWAIT; + if (msg_flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_control_user = sr->msg_control; kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return -EAGAIN; - if (ret > 0 && io_net_retry(sock, flags)) { + if (ret > 0 && io_net_retry(sock, msg_flags)) { sr->done_io += ret; return -EAGAIN; } From 2f9965f5d5de022e2040231712f36d4e93d8a3df Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Feb 2026 11:45:54 +0000 Subject: [PATCH 03/58] io_uring/zctx: move vec regbuf import into io_send_zc_import Unify send and sendmsg zerocopy paths for importing registered buffers and make io_send_zc_import() responsible for that. It's a preparation patch making the next change simpler. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/net.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 07f002c1d7df..9452793c21f1 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1441,17 +1441,34 @@ static int io_sg_from_iter(struct sk_buff *skb, return ret; } -static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) +static int io_send_zc_import(struct io_kiocb *req, + struct io_async_msghdr *kmsg, + unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg = req->async_data; + struct io_kiocb *notif = sr->notif; + int ret; WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF)); - sr->notif->buf_index = req->buf_index; - return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, - (u64)(uintptr_t)sr->buf, sr->len, - ITER_SOURCE, issue_flags); + notif->buf_index = req->buf_index; + + if (req->opcode == IORING_OP_SEND_ZC) { + ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter, + (u64)(uintptr_t)sr->buf, sr->len, + ITER_SOURCE, issue_flags); + } else { + unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; + + ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, + notif, &kmsg->vec, uvec_segs, + issue_flags); + } + + if (unlikely(ret)) + return ret; + req->flags &= ~REQ_F_IMPORT_BUFFER; + return 0; } int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) @@ -1473,8 +1490,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (req->flags & REQ_F_IMPORT_BUFFER) { - req->flags &= ~REQ_F_IMPORT_BUFFER; - ret = io_send_zc_import(req, issue_flags); + ret = io_send_zc_import(req, kmsg, issue_flags); if (unlikely(ret)) return ret; } @@ -1530,16 +1546,9 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) int ret, min_ret = 0; if (req->flags & REQ_F_IMPORT_BUFFER) { - unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; - int ret; - - sr->notif->buf_index = req->buf_index; - ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, - sr->notif, &kmsg->vec, uvec_segs, - issue_flags); + ret = io_send_zc_import(req, kmsg, issue_flags); if (unlikely(ret)) return ret; - req->flags &= ~REQ_F_IMPORT_BUFFER; } sock = sock_from_file(req->file); From 403fec55bf938bde4f8361b1afa135c853ca5ef9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Feb 2026 11:45:55 +0000 Subject: [PATCH 04/58] io_uring/zctx: unify zerocopy issue variants io_send_zc and io_sendmsg_zc started different but now the only real difference between them is how registered buffers are imported and which net helper we use. Avoid duplication and combine them into a single function. 
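For reference, the userspace contract stays the same for both variants after this unification: each zerocopy send posts a completion CQE with IORING_CQE_F_MORE set and, later, a notification CQE with IORING_CQE_F_NOTIF once the kernel is done with the buffer. A rough liburing sketch, with socket, buffer and msghdr setup assumed elsewhere:

#include <liburing.h>
#include <sys/socket.h>

static void queue_zc_sends(struct io_uring *ring, int sockfd,
			   const void *buf, size_t len, struct msghdr *msg)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0, 0);
	sqe->user_data = 1;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_sendmsg_zc(sqe, sockfd, msg, 0);
	sqe->user_data = 2;

	io_uring_submit(ring);
}

static void reap_zc_cqes(struct io_uring *ring, int expected)
{
	struct io_uring_cqe *cqe;

	/* two requests -> expect up to four CQEs: two sends, two notifs */
	while (expected-- && !io_uring_wait_cqe(ring, &cqe)) {
		if (cqe->flags & IORING_CQE_F_NOTIF)
			;	/* buffer for cqe->user_data may be reused now */
		else if (cqe->flags & IORING_CQE_F_MORE)
			;	/* bytes sent are in cqe->res, notif still pending */
		io_uring_cqe_seen(ring, cqe);
	}
}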
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/net.c | 92 +++++++++--------------------------------------- io_uring/net.h | 1 - io_uring/opdef.c | 2 +- 3 files changed, 17 insertions(+), 78 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 9452793c21f1..3e6112beea88 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1471,72 +1471,6 @@ static int io_send_zc_import(struct io_kiocb *req, return 0; } -int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg = req->async_data; - struct socket *sock; - unsigned msg_flags; - int ret, min_ret = 0; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) - return -EOPNOTSUPP; - - if (!(req->flags & REQ_F_POLLED) && - (zc->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - if (req->flags & REQ_F_IMPORT_BUFFER) { - ret = io_send_zc_import(req, kmsg, issue_flags); - if (unlikely(ret)) - return ret; - } - - msg_flags = zc->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - msg_flags |= MSG_DONTWAIT; - if (msg_flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - - kmsg->msg.msg_flags = msg_flags; - kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; - ret = sock_sendmsg(sock, &kmsg->msg); - - if (unlikely(ret < min_ret)) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - - if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { - zc->done_io += ret; - return -EAGAIN; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } - - if (ret >= 0) - ret += zc->done_io; - else if (zc->done_io) - ret = zc->done_io; - - /* - * If we're in io-wq we can't rely on tw ordering guarantees, defer - * flushing notif to io_send_zc_cleanup() - */ - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - io_notif_flush(zc->notif); - zc->notif = NULL; - io_req_msg_cleanup(req, 0); - } - io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_COMPLETE; -} - int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -1545,37 +1479,43 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned msg_flags; int ret, min_ret = 0; - if (req->flags & REQ_F_IMPORT_BUFFER) { - ret = io_send_zc_import(req, kmsg, issue_flags); - if (unlikely(ret)) - return ret; - } - sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_send_zc_import(req, kmsg, issue_flags); + if (unlikely(ret)) + return ret; + } + msg_flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); - kmsg->msg.msg_control_user = sr->msg_control; kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; - ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags); + + if (req->opcode == IORING_OP_SEND_ZC) { + msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; + kmsg->msg.msg_flags = msg_flags; + ret = sock_sendmsg(sock, &kmsg->msg); + } else { + kmsg->msg.msg_control_user = sr->msg_control; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags); + } if (unlikely(ret < min_ret)) { if 
(ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return -EAGAIN; - if (ret > 0 && io_net_retry(sock, msg_flags)) { + if (ret > 0 && io_net_retry(sock, sr->msg_flags)) { sr->done_io += ret; return -EAGAIN; } diff --git a/io_uring/net.h b/io_uring/net.h index a862960a3bb9..d4d1ddce50e3 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -50,7 +50,6 @@ void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req) int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); -int io_send_zc(struct io_kiocb *req, unsigned int issue_flags); int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 91a23baf415e..645980fa4651 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -437,7 +437,7 @@ const struct io_issue_def io_issue_defs[] = { #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, - .issue = io_send_zc, + .issue = io_sendmsg_zc, #else .prep = io_eopnotsupp_prep, #endif From dc156e0f1aae69659885422364438ff8eb022e09 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Feb 2026 23:29:39 +0000 Subject: [PATCH 05/58] io_uring/zcrx: declare some constants for query Add constants for zcrx features and supported registration flags that can be reused by the query code. I was going to add another registration flag, and this patch helps to avoid duplication and keeps changes specific to zcrx files. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/query.c | 4 ++-- io_uring/zcrx.c | 4 +++- io_uring/zcrx.h | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/io_uring/query.c b/io_uring/query.c index 63cc30c9803d..c1704d088374 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -34,12 +34,12 @@ static ssize_t io_query_zcrx(union io_query_data *data) { struct io_uring_query_zcrx *e = &data->zcrx; - e->register_flags = ZCRX_REG_IMPORT; + e->register_flags = ZCRX_SUPPORTED_REG_FLAGS; e->area_flags = IORING_ZCRX_AREA_DMABUF; e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST; e->rq_hdr_size = sizeof(struct io_uring); e->rq_hdr_alignment = L1_CACHE_BYTES; - e->features = ZCRX_FEATURE_RX_PAGE_SIZE; + e->features = ZCRX_FEATURES; e->__resv2 = 0; return sizeof(*e); } diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 62d693287457..73fa82759771 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -778,11 +778,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EFAULT; if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) || reg.zcrx_id) return -EINVAL; + if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS) + return -EINVAL; if (reg.flags & ZCRX_REG_IMPORT) return import_zcrx(ctx, arg, ®); if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) return -EFAULT; - if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) + if (reg.if_rxq == -1 || !reg.rq_entries) return -EINVAL; if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { if (!(ctx->flags & IORING_SETUP_CLAMP)) diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 32ab95b2cb81..0ddcf0ee8861 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -8,6 +8,9 @@ #include #include +#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT) +#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE) + struct io_zcrx_mem { unsigned long size; bool is_dmabuf; From d9d2455e77d0f36a22b9dbaba8b6354dd1378101 Mon Sep 17 
00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Feb 2026 23:31:20 +0000 Subject: [PATCH 06/58] io_uring/zcrx: move zcrx uapi into separate header Split out zcrx uapi into a separate file. It'll be easier to manage it this way, and that reduces the size of a not so small io_uring.h. Since there are users that expect that zcrx definitions come with io_uring.h, it includes the new file. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 96 +------------------------ include/uapi/linux/io_uring/zcrx.h | 108 +++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+), 94 deletions(-) create mode 100644 include/uapi/linux/io_uring/zcrx.h diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1ff16141c8a5..17475c2045fb 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -10,6 +10,8 @@ #include #include +#include + /* * this file is shared with liburing and that has to autodetect * if linux/time_types.h is available or not, it can @@ -1050,100 +1052,6 @@ struct io_timespec { __u64 tv_nsec; }; -/* Zero copy receive refill queue entry */ -struct io_uring_zcrx_rqe { - __u64 off; - __u32 len; - __u32 __pad; -}; - -struct io_uring_zcrx_cqe { - __u64 off; - __u64 __pad; -}; - -/* The bit from which area id is encoded into offsets */ -#define IORING_ZCRX_AREA_SHIFT 48 -#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) - -struct io_uring_zcrx_offsets { - __u32 head; - __u32 tail; - __u32 rqes; - __u32 __resv2; - __u64 __resv[2]; -}; - -enum io_uring_zcrx_area_flags { - IORING_ZCRX_AREA_DMABUF = 1, -}; - -struct io_uring_zcrx_area_reg { - __u64 addr; - __u64 len; - __u64 rq_area_token; - __u32 flags; - __u32 dmabuf_fd; - __u64 __resv2[2]; -}; - -enum zcrx_reg_flags { - ZCRX_REG_IMPORT = 1, -}; - -enum zcrx_features { - /* - * The user can ask for the desired rx page size by passing the - * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. - */ - ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, -}; - -/* - * Argument for IORING_REGISTER_ZCRX_IFQ - */ -struct io_uring_zcrx_ifq_reg { - __u32 if_idx; - __u32 if_rxq; - __u32 rq_entries; - __u32 flags; - - __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ - __u64 region_ptr; /* struct io_uring_region_desc * */ - - struct io_uring_zcrx_offsets offsets; - __u32 zcrx_id; - __u32 rx_buf_len; - __u64 __resv[3]; -}; - -enum zcrx_ctrl_op { - ZCRX_CTRL_FLUSH_RQ, - ZCRX_CTRL_EXPORT, - - __ZCRX_CTRL_LAST, -}; - -struct zcrx_ctrl_flush_rq { - __u64 __resv[6]; -}; - -struct zcrx_ctrl_export { - __u32 zcrx_fd; - __u32 __resv1[11]; -}; - -struct zcrx_ctrl { - __u32 zcrx_id; - __u32 op; /* see enum zcrx_ctrl_op */ - __u64 __resv[2]; - - union { - struct zcrx_ctrl_export zc_export; - struct zcrx_ctrl_flush_rq zc_flush; - }; -}; - #ifdef __cplusplus } #endif diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h new file mode 100644 index 000000000000..3163a4b8aeb0 --- /dev/null +++ b/include/uapi/linux/io_uring/zcrx.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ +/* + * Header file for the io_uring zerocopy receive (zcrx) interface. + * + * Copyright (C) 2026 Pavel Begunkov + * Copyright (C) 2026 David Wei + * Copyright (C) Meta Platforms, Inc. 
+ */ +#ifndef LINUX_IO_ZCRX_H +#define LINUX_IO_ZCRX_H + +#include + +/* Zero copy receive refill queue entry */ +struct io_uring_zcrx_rqe { + __u64 off; + __u32 len; + __u32 __pad; +}; + +struct io_uring_zcrx_cqe { + __u64 off; + __u64 __pad; +}; + +/* The bit from which area id is encoded into offsets */ +#define IORING_ZCRX_AREA_SHIFT 48 +#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) + +struct io_uring_zcrx_offsets { + __u32 head; + __u32 tail; + __u32 rqes; + __u32 __resv2; + __u64 __resv[2]; +}; + +enum io_uring_zcrx_area_flags { + IORING_ZCRX_AREA_DMABUF = 1, +}; + +struct io_uring_zcrx_area_reg { + __u64 addr; + __u64 len; + __u64 rq_area_token; + __u32 flags; + __u32 dmabuf_fd; + __u64 __resv2[2]; +}; + +enum zcrx_reg_flags { + ZCRX_REG_IMPORT = 1, +}; + +enum zcrx_features { + /* + * The user can ask for the desired rx page size by passing the + * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. + */ + ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, +}; + +/* + * Argument for IORING_REGISTER_ZCRX_IFQ + */ +struct io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u32 zcrx_id; + __u32 rx_buf_len; + __u64 __resv[3]; +}; + +enum zcrx_ctrl_op { + ZCRX_CTRL_FLUSH_RQ, + ZCRX_CTRL_EXPORT, + + __ZCRX_CTRL_LAST, +}; + +struct zcrx_ctrl_flush_rq { + __u64 __resv[6]; +}; + +struct zcrx_ctrl_export { + __u32 zcrx_fd; + __u32 __resv1[11]; +}; + +struct zcrx_ctrl { + __u32 zcrx_id; + __u32 op; /* see enum zcrx_ctrl_op */ + __u64 __resv[2]; + + union { + struct zcrx_ctrl_export zc_export; + struct zcrx_ctrl_flush_rq zc_flush; + }; +}; + +#endif /* LINUX_IO_ZCRX_H */ From 484ae637a3e3d909718de7c07afd3bb34b6b8504 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 2 Mar 2026 13:10:34 +0000 Subject: [PATCH 07/58] io_uring/timeout: check unused sqe fields Zero check unused SQE fields addr3 and pad2 for timeout and timeout update requests. They're not needed now, but could be used sometime in the future. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index cb61d4862fc6..e3815e3465dd 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -449,6 +449,8 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; + if (sqe->addr3 || sqe->__pad2[0]) + return -EINVAL; if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; @@ -521,6 +523,8 @@ static int __io_timeout_prep(struct io_kiocb *req, unsigned flags; u32 off = READ_ONCE(sqe->off); + if (sqe->addr3 || sqe->__pad2[0]) + return -EINVAL; if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) From 6e3f5943a49b1593921fd340ff1dffba851c2afd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 2 Mar 2026 13:10:35 +0000 Subject: [PATCH 08/58] io_uring/timeout: add helper for parsing user time There is some duplication for timespec checks that can be deduplicated with a new function, and it'll be extended in next patches. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/timeout.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index e3815e3465dd..f6520599e3e8 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -35,6 +35,18 @@ struct io_timeout_rem { bool ltimeout; }; +static int io_parse_user_time(struct timespec64 *ts_out, u64 arg) +{ + struct timespec64 ts; + + if (get_timespec64(&ts, u64_to_user_ptr(arg))) + return -EFAULT; + if (ts.tv_sec < 0 || ts.tv_nsec < 0) + return -EINVAL; + *ts_out = ts; + return 0; +} + static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_kiocb *link); @@ -446,6 +458,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem); + int ret; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; @@ -464,10 +477,9 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) tr->ltimeout = true; if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(READ_ONCE(sqe->addr2)))) - return -EFAULT; - if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) - return -EINVAL; + ret = io_parse_user_time(&tr->ts, READ_ONCE(sqe->addr2)); + if (ret) + return ret; } else if (tr->flags) { /* timeout removal doesn't support flags */ return -EINVAL; @@ -522,6 +534,7 @@ static int __io_timeout_prep(struct io_kiocb *req, struct io_timeout_data *data; unsigned flags; u32 off = READ_ONCE(sqe->off); + int ret; if (sqe->addr3 || sqe->__pad2[0]) return -EINVAL; @@ -561,11 +574,9 @@ static int __io_timeout_prep(struct io_kiocb *req, data->req = req; data->flags = flags; - if (get_timespec64(&data->ts, u64_to_user_ptr(READ_ONCE(sqe->addr)))) - return -EFAULT; - - if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) - return -EINVAL; + ret = io_parse_user_time(&data->ts, READ_ONCE(sqe->addr)); + if (ret) + return ret; data->mode = io_translate_timeout_mode(flags); From 0e78aa188cbddc6311ff24938b1c8d3850b35708 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 2 Mar 2026 13:10:36 +0000 Subject: [PATCH 09/58] io_uring/timeout: migrate reqs from ts64 to ktime It'll be more convenient for next patches to keep ktime in requests instead of timespec64. Convert everything to ktime right after user argument parsing at request prep time. 
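For readers less familiar with the types involved, ktime_t is just a signed 64-bit nanosecond count, so the conversion done once at prep time is trivial and the stored value can be handed to hrtimer_start() as-is. A small illustration, not part of the patch:

#include <linux/ktime.h>

static ktime_t example_to_ktime(void)
{
	struct timespec64 ts = { .tv_sec = 2, .tv_nsec = 500 * NSEC_PER_MSEC };

	/* both yield the same s64 nanosecond value: 2500000000 ns */
	ktime_t a = timespec64_to_ktime(ts);
	ktime_t b = ktime_set(ts.tv_sec, ts.tv_nsec);

	return a == b ? a : 0;
}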
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/timeout.c | 31 +++++++++++++++---------------- io_uring/timeout.h | 2 +- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index f6520599e3e8..4b67746ea3ca 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -30,12 +30,12 @@ struct io_timeout_rem { u64 addr; /* timeout update */ - struct timespec64 ts; + ktime_t time; u32 flags; bool ltimeout; }; -static int io_parse_user_time(struct timespec64 *ts_out, u64 arg) +static int io_parse_user_time(ktime_t *time, u64 arg) { struct timespec64 ts; @@ -43,7 +43,7 @@ static int io_parse_user_time(struct timespec64 *ts_out, u64 arg) return -EFAULT; if (ts.tv_sec < 0 || ts.tv_nsec < 0) return -EINVAL; - *ts_out = ts; + *time = timespec64_to_ktime(ts); return 0; } @@ -92,7 +92,7 @@ static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw) /* re-arm timer */ raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + hrtimer_start(&data->timer, data->time, data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return; } @@ -407,7 +407,7 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data) } static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) + ktime_t ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; @@ -429,12 +429,12 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, if (hrtimer_try_to_cancel(&io->timer) == -1) return -EALREADY; hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode); - hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + hrtimer_start(&io->timer, ts, mode); return 0; } static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) + ktime_t time, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; @@ -447,11 +447,11 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, timeout->off = 0; /* noseq */ data = req->async_data; - data->ts = *ts; + data->time = time; list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode); - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); + hrtimer_start(&data->timer, data->time, mode); return 0; } @@ -477,7 +477,7 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) tr->ltimeout = true; if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; - ret = io_parse_user_time(&tr->ts, READ_ONCE(sqe->addr2)); + ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2)); if (ret) return ret; } else if (tr->flags) { @@ -514,9 +514,9 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) - ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + ret = io_linked_timeout_update(ctx, tr->addr, tr->time, mode); else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + ret = io_timeout_update(ctx, tr->addr, tr->time, mode); raw_spin_unlock_irq(&ctx->timeout_lock); } @@ -574,7 +574,7 @@ static int __io_timeout_prep(struct io_kiocb *req, data->req = req; data->flags = flags; - ret = io_parse_user_time(&data->ts, 
READ_ONCE(sqe->addr)); + ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr)); if (ret) return ret; @@ -652,7 +652,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) } add: list_add(&timeout->list, entry); - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + hrtimer_start(&data->timer, data->time, data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -670,8 +670,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) if (timeout->head) { struct io_timeout_data *data = req->async_data; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), - data->mode); + hrtimer_start(&data->timer, data->time, data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } raw_spin_unlock_irq(&ctx->timeout_lock); diff --git a/io_uring/timeout.h b/io_uring/timeout.h index 2b7c9ad72992..1620f94dd45a 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -3,7 +3,7 @@ struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; - struct timespec64 ts; + ktime_t time; enum hrtimer_mode mode; u32 flags; };
From d8345a21902af5d754f2c2aadf877de989e3cac3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 2 Mar 2026 13:10:37 +0000 Subject: [PATCH 10/58] io_uring/timeout: immediate timeout arg One of the things the user has always had to keep in mind is that any user pointer they put into an SQE is not going to be read by the kernel until submission happens, and the user has to ensure the pointee stays alive until then. For example, the snippet below will lead to a UAF of the on-stack variable ts. Instead of passing the timeout value as a pointer, allow it to be stored immediately in the SQE. The user has to set a new flag called IORING_TIMEOUT_IMMEDIATE_ARG, in which case sqe->addr for timeout or sqe->addr2 for timeout update requests will be interpreted as a time value in nanoseconds. void prep_timeout(struct io_uring_sqe *sqe) { struct __kernel_timespec ts = {...}; io_uring_prep_timeout(sqe, &ts, 0, 0); } void submit() { sqe = get_sqe(); prep_timeout(sqe); io_uring_submit(); } Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++++ io_uring/timeout.c | 20 +++++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 17475c2045fb..17ac1b785440 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -343,6 +343,10 @@ enum io_uring_op { /* * sqe->timeout_flags + * + * IORING_TIMEOUT_IMMEDIATE_ARG: If set, sqe->addr stores the timeout + * value in nanoseconds instead of + * pointing to a timespec.
*/ #define IORING_TIMEOUT_ABS (1U << 0) #define IORING_TIMEOUT_UPDATE (1U << 1) @@ -351,6 +355,7 @@ enum io_uring_op { #define IORING_LINK_TIMEOUT_UPDATE (1U << 4) #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_MULTISHOT (1U << 6) +#define IORING_TIMEOUT_IMMEDIATE_ARG (1U << 7) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 4b67746ea3ca..8eddf8add7a2 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -35,10 +35,17 @@ struct io_timeout_rem { bool ltimeout; }; -static int io_parse_user_time(ktime_t *time, u64 arg) +static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags) { struct timespec64 ts; + if (flags & IORING_TIMEOUT_IMMEDIATE_ARG) { + *time = ns_to_ktime(arg); + if (*time < 0) + return -EINVAL; + return 0; + } + if (get_timespec64(&ts, u64_to_user_ptr(arg))) return -EFAULT; if (ts.tv_sec < 0 || ts.tv_nsec < 0) @@ -475,9 +482,11 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) tr->ltimeout = true; - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK | + IORING_TIMEOUT_ABS | + IORING_TIMEOUT_IMMEDIATE_ARG)) return -EINVAL; - ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2)); + ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2), tr->flags); if (ret) return ret; } else if (tr->flags) { @@ -545,7 +554,8 @@ static int __io_timeout_prep(struct io_kiocb *req, flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | IORING_TIMEOUT_ETIME_SUCCESS | - IORING_TIMEOUT_MULTISHOT)) + IORING_TIMEOUT_MULTISHOT | + IORING_TIMEOUT_IMMEDIATE_ARG)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) @@ -574,7 +584,7 @@ static int __io_timeout_prep(struct io_kiocb *req, data->req = req; data->flags = flags; - ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr)); + ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr), flags); if (ret) return ret; From dce00c83035b880deebf7b2f0a204f740cb90d64 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 3 Mar 2026 12:32:19 +0000 Subject: [PATCH 11/58] io_uring/net: allow vectorised regbuf send zc Enable IORING_SEND_VECTORIZED with registered buffers for IORING_OP_SEND_ZC. Set IORING_SEND_VECTORIZED for all msg send requests to differentiate if the vectorised version is expected. 
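Looping back to the IORING_TIMEOUT_IMMEDIATE_ARG flag added in the timeout patch above, here is a rough userspace sketch of arming a 10ms relative timeout without keeping a timespec alive. The SQE is filled in by hand since no liburing helper for the new flag is assumed to exist; field usage follows the uapi comment above.

#include <string.h>
#include <liburing.h>

static void prep_timeout_10ms(struct io_uring *ring)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_TIMEOUT;
	sqe->fd = -1;
	/* the timeout itself lives in the SQE: 10ms expressed in nanoseconds */
	sqe->addr = 10ULL * 1000 * 1000;
	sqe->len = 1;			/* timeout SQEs expect len == 1 */
	sqe->timeout_flags = IORING_TIMEOUT_IMMEDIATE_ARG;
	sqe->user_data = 0x10;
	/* nothing here needs to stay alive until io_uring_submit() */
}

The same applies to timeout update requests, where sqe->addr2 carries the nanosecond value instead.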
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/net.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 3e6112beea88..3e68593e8164 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -375,10 +375,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_namelen = addr_len; } if (sr->flags & IORING_RECVSEND_FIXED_BUF) { - if (sr->flags & IORING_SEND_VECTORIZED) - return -EINVAL; - req->flags |= REQ_F_IMPORT_BUFFER; - return 0; + if (!(sr->flags & IORING_SEND_VECTORIZED)) { + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; + } + + kmsg->msg.msg_iter.nr_segs = sr->len; + return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len); } if (req->flags & REQ_F_BUFFER_SELECT) return 0; @@ -396,6 +399,7 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe struct user_msghdr msg; int ret; + sr->flags |= IORING_SEND_VECTORIZED; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); if (unlikely(ret)) @@ -1453,7 +1457,7 @@ static int io_send_zc_import(struct io_kiocb *req, notif->buf_index = req->buf_index; - if (req->opcode == IORING_OP_SEND_ZC) { + if (!(sr->flags & IORING_SEND_VECTORIZED)) { ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter, (u64)(uintptr_t)sr->buf, sr->len, ITER_SOURCE, issue_flags); From cb9487333652b2cfb4f10ef596fc5b675241cae9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Feb 2026 11:48:52 +0000 Subject: [PATCH 12/58] io_uring/zctx: separate notification user_data People previously asked for the notification CQE to have a different user_data value from the main request completion. It's useful to separate buffer and request handling logic and avoid separately refcounting the request. Let the user pass the notification user_data in sqe->addr3. If zero, it'll inherit sqe->user_data as before. It doesn't change the rules for when the user can expect a notification CQE, and it should still check the IORING_CQE_F_MORE flag. 
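A short userspace sketch of the new field, assuming plain liburing send-zc prep with sqe->addr3 filled in by hand (no dedicated helper is assumed):

#include <liburing.h>

#define SEND_TAG	0x100	/* illustrative tags, not part of the ABI */
#define NOTIF_TAG	0x200

static void prep_send_zc_tagged(struct io_uring *ring, int sockfd,
				const void *buf, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0, 0);
	sqe->user_data = SEND_TAG;	/* completion CQE, has IORING_CQE_F_MORE */
	sqe->addr3 = NOTIF_TAG;		/* notification CQE, has IORING_CQE_F_NOTIF */
}

Leaving addr3 at zero preserves the old behaviour of the notification inheriting the request's user_data; the same tagging works for the vectorised registered-buffer sends from the previous patch.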
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/net.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 3e68593e8164..3f9d08b78c21 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1333,11 +1333,12 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_ring_ctx *ctx = req->ctx; struct io_async_msghdr *iomsg; struct io_kiocb *notif; + u64 user_data; int ret; zc->done_io = 0; - if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) + if (unlikely(READ_ONCE(sqe->__pad2[0]))) return -EINVAL; /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ if (req->flags & REQ_F_CQE_SKIP) @@ -1346,7 +1347,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) notif = zc->notif = io_alloc_notif(ctx); if (!notif) return -ENOMEM; - notif->cqe.user_data = req->cqe.user_data; + user_data = READ_ONCE(sqe->addr3); + if (!user_data) + user_data = req->cqe.user_data; + + notif->cqe.user_data = user_data; notif->cqe.res = 0; notif->cqe.flags = IORING_CQE_F_NOTIF; req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY; From f1a424e21c15993db0f9594cda17ef5d516ab3e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 14 Mar 2026 08:41:04 -0600 Subject: [PATCH 13/58] io_uring: switch struct io_ring_ctx internal bitfields to flags Bitfields cannot be set and checked atomically, and this makes it more clear that these are indeed in shared storage and must be checked and set in a sane fashion. This is in preparation for annotating a few of the known racy, but harmless, flags checking. No intended functional changes in this patch. Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 34 ++++++++------ io_uring/eventfd.c | 4 +- io_uring/io_uring.c | 82 +++++++++++++++++----------------- io_uring/io_uring.h | 9 ++-- io_uring/msg_ring.c | 2 +- io_uring/register.c | 8 ++-- io_uring/rsrc.c | 8 ++-- io_uring/tctx.c | 2 +- io_uring/timeout.c | 4 +- io_uring/tw.c | 2 +- 10 files changed, 82 insertions(+), 73 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index dd1420bfcb73..0b3f08adc217 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -268,24 +268,30 @@ struct io_alloc_cache { unsigned int init_clear; }; +enum { + IO_RING_F_DRAIN_NEXT = BIT(0), + IO_RING_F_OP_RESTRICTED = BIT(1), + IO_RING_F_REG_RESTRICTED = BIT(2), + IO_RING_F_OFF_TIMEOUT_USED = BIT(3), + IO_RING_F_DRAIN_ACTIVE = BIT(4), + IO_RING_F_HAS_EVFD = BIT(5), + /* all CQEs should be posted only by the submitter task */ + IO_RING_F_TASK_COMPLETE = BIT(6), + IO_RING_F_LOCKLESS_CQ = BIT(7), + IO_RING_F_SYSCALL_IOPOLL = BIT(8), + IO_RING_F_POLL_ACTIVATED = BIT(9), + IO_RING_F_DRAIN_DISABLED = BIT(10), + IO_RING_F_COMPAT = BIT(11), + IO_RING_F_IOWQ_LIMITS_SET = BIT(12), +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { + /* ring setup flags */ unsigned int flags; - unsigned int drain_next: 1; - unsigned int op_restricted: 1; - unsigned int reg_restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int has_evfd: 1; - /* all CQEs should be posted only by the submitter task */ - unsigned int task_complete: 1; - unsigned int lockless_cq: 1; - unsigned int syscall_iopoll: 1; - unsigned int poll_activated: 1; - unsigned int drain_disabled: 1; - unsigned int compat: 1; - unsigned int iowq_limits_set : 1; + /* internal state flags IO_RING_F_* flags 
, mostly read-only */ + unsigned int int_flags; struct task_struct *submitter_task; struct io_rings *rings; diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 7482a7dc6b38..3da028500f76 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -148,7 +148,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, spin_unlock(&ctx->completion_lock); ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; + ctx->int_flags |= IO_RING_F_HAS_EVFD; refcount_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); rcu_assign_pointer(ctx->io_ev_fd, ev_fd); @@ -162,7 +162,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) { - ctx->has_evfd = false; + ctx->int_flags &= ~IO_RING_F_HAS_EVFD; rcu_assign_pointer(ctx->io_ev_fd, NULL); io_eventfd_put(ev_fd); return 0; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9a37035e76c0..bfeb3bc3849d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -477,17 +477,17 @@ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (ctx->poll_activated) + if (ctx->int_flags & IO_RING_F_POLL_ACTIVATED) io_poll_wq_wake(ctx); - if (ctx->off_timeout_used) + if (ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED) io_flush_timeouts(ctx); - if (ctx->has_evfd) + if (ctx->int_flags & IO_RING_F_HAS_EVFD) io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_lock(&ctx->completion_lock); } @@ -500,11 +500,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); - if (!ctx->task_complete) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_unlock(&ctx->completion_lock); /* IOPOLL rings only need to wake up if it's also SQPOLL */ - if (!ctx->syscall_iopoll) + if (!(ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL)) io_cqring_wake(ctx); } io_commit_cqring_flush(ctx); @@ -830,7 +830,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { lockdep_assert_held(&ctx->uring_lock); - lockdep_assert(ctx->lockless_cq); + lockdep_assert(ctx->int_flags & IO_RING_F_LOCKLESS_CQ); if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { struct io_cqe cqe = io_init_cqe(user_data, res, cflags); @@ -860,7 +860,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - if (!ctx->lockless_cq) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); spin_unlock(&ctx->completion_lock); @@ -885,7 +885,7 @@ bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) lockdep_assert_held(&ctx->uring_lock); cqe[0].user_data = req->cqe.user_data; - if (!ctx->lockless_cq) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux32(ctx, cqe); spin_unlock(&ctx->completion_lock); @@ -913,7 +913,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. 
DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { + if ((ctx->int_flags & IO_RING_F_LOCKLESS_CQ) || (req->flags & REQ_F_REISSUE)) { defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); @@ -1135,7 +1135,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - if (ctx->lockless_cq) + if (ctx->int_flags & IO_RING_F_LOCKLESS_CQ) io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); else io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); @@ -1148,7 +1148,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) INIT_WQ_LIST(&state->compl_reqs); } - if (unlikely(ctx->drain_active)) + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) io_queue_deferred(ctx); ctx->submit_state.cq_flush = false; @@ -1344,7 +1344,7 @@ static __cold void io_drain_req(struct io_kiocb *req) list_add_tail(&de->list, &ctx->defer_list); io_queue_deferred(ctx); if (!drain && list_empty(&ctx->defer_list)) - ctx->drain_active = false; + ctx->int_flags &= ~IO_RING_F_DRAIN_ACTIVE; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -1655,7 +1655,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) } else { /* can't fail with IO_URING_F_INLINE */ io_req_sqe_copy(req, IO_URING_F_INLINE); - if (unlikely(req->ctx->drain_active)) + if (unlikely(req->ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) io_drain_req(req); else io_queue_iowq(req); @@ -1671,7 +1671,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (!ctx->op_restricted) + if (!(ctx->int_flags & IO_RING_F_OP_RESTRICTED)) return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -1691,7 +1691,7 @@ static void io_init_drain(struct io_ring_ctx *ctx) { struct io_kiocb *head = ctx->submit_state.link.head; - ctx->drain_active = true; + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; if (head) { /* * If we need to drain a request in the middle of a link, drain @@ -1701,7 +1701,7 @@ static void io_init_drain(struct io_ring_ctx *ctx) * link. 
*/ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; - ctx->drain_next = true; + ctx->int_flags |= IO_RING_F_DRAIN_NEXT; } } @@ -1767,23 +1767,23 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) - ctx->drain_disabled = true; + ctx->int_flags |= IO_RING_F_DRAIN_DISABLED; if (sqe_flags & IOSQE_IO_DRAIN) { - if (ctx->drain_disabled) + if (ctx->int_flags & IO_RING_F_DRAIN_DISABLED) return io_init_fail_req(req, -EOPNOTSUPP); io_init_drain(ctx); } } - if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) { + if (unlikely(ctx->int_flags & (IO_RING_F_OP_RESTRICTED | IO_RING_F_DRAIN_ACTIVE | IO_RING_F_DRAIN_NEXT))) { if (!io_check_restriction(ctx, req, sqe_flags)) return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ - if (ctx->drain_active) + if (ctx->int_flags & IO_RING_F_DRAIN_ACTIVE) req->flags |= REQ_F_FORCE_ASYNC; /* if there is no link, we're at "next" request and need to drain */ - if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { - ctx->drain_next = false; - ctx->drain_active = true; + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_NEXT) && !ctx->submit_state.link.head) { + ctx->int_flags &= ~IO_RING_F_DRAIN_NEXT; + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } @@ -2204,7 +2204,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb) poll_wq_task_work); mutex_lock(&ctx->uring_lock); - ctx->poll_activated = true; + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; mutex_unlock(&ctx->uring_lock); /* @@ -2219,9 +2219,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in progress */ - if (ctx->poll_activated || ctx->poll_wq_task_work.func) + if ((ctx->int_flags & IO_RING_F_POLL_ACTIVATED) || ctx->poll_wq_task_work.func) goto out; - if (WARN_ON_ONCE(!ctx->task_complete)) + if (WARN_ON_ONCE(!(ctx->int_flags & IO_RING_F_TASK_COMPLETE))) goto out; if (!ctx->submitter_task) goto out; @@ -2242,7 +2242,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - if (unlikely(!ctx->poll_activated)) + if (unlikely(!(ctx->int_flags & IO_RING_F_POLL_ACTIVATED))) io_activate_pollwq(ctx); /* * provides mb() which pairs with barrier from wq_has_sleeper @@ -2607,7 +2607,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - if (ctx->syscall_iopoll) + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) goto iopoll_locked; /* * Ignore errors, we'll soon call io_cqring_wait() and @@ -2622,7 +2622,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (flags & IORING_ENTER_GETEVENTS) { int ret2; - if (ctx->syscall_iopoll) { + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) { /* * We disallow the app entering submit/complete with * polling, but we still need to lock the ring to @@ -2923,9 +2923,9 @@ static void io_ctx_restriction_clone(struct io_ring_ctx *ctx, if (dst->bpf_filters) WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters); if (dst->op_registered) - ctx->op_restricted = 1; + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; if (dst->reg_registered) - ctx->reg_restricted = 1; + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; } static __cold int io_uring_create(struct io_ctx_config *config) @@ -2952,17 +2952,18 @@ static __cold int 
io_uring_create(struct io_ctx_config *config) if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL)) - ctx->task_complete = true; + ctx->int_flags |= IO_RING_F_TASK_COMPLETE; - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) - ctx->lockless_cq = true; + if ((ctx->int_flags & IO_RING_F_TASK_COMPLETE) || + (ctx->flags & IORING_SETUP_IOPOLL)) + ctx->int_flags |= IO_RING_F_LOCKLESS_CQ; /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() */ - if (!ctx->task_complete) - ctx->poll_activated = true; + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user @@ -2972,9 +2973,10 @@ static __cold int io_uring_create(struct io_ctx_config *config) */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) - ctx->syscall_iopoll = 1; + ctx->int_flags |= IO_RING_F_SYSCALL_IOPOLL; - ctx->compat = in_compat_syscall(); + if (in_compat_syscall()) + ctx->int_flags |= IO_RING_F_COMPAT; if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 0fa844faf287..5cb1983043cd 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -211,7 +211,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); - } else if (!ctx->task_complete) { + } else if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { lockdep_assert_held(&ctx->completion_lock); } else if (ctx->submitter_task) { /* @@ -228,7 +228,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) static inline bool io_is_compat(struct io_ring_ctx *ctx) { - return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->int_flags & IO_RING_F_COMPAT); } static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) @@ -472,8 +472,9 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || - ctx->has_evfd || ctx->poll_activated)) + if (unlikely(ctx->int_flags & (IO_RING_F_OFF_TIMEOUT_USED | + IO_RING_F_HAS_EVFD | + IO_RING_F_POLL_ACTIVATED))) __io_commit_cqring_flush(ctx); } diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 57ad0085869a..3ff9098573db 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -67,7 +67,7 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - return target_ctx->task_complete; + return target_ctx->int_flags & IO_RING_F_TASK_COMPLETE; } static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) diff --git a/io_uring/register.c b/io_uring/register.c index 0148735f7711..489a6feaf228 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -184,9 +184,9 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, return ret; } if (ctx->restrictions.op_registered) - ctx->op_restricted = 1; + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; if (ctx->restrictions.reg_registered) - ctx->reg_restricted = 1; + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; return 0; } @@ -384,7 +384,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i]) ctx->iowq_limits[i] = 
new_count[i]; - ctx->iowq_limits_set = true; + ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET; if (tctx && tctx->io_wq) { ret = io_wq_max_workers(tctx->io_wq, new_count); @@ -725,7 +725,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; - if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { + if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4fa59bf89bba..52554ed89b11 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -295,7 +295,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, u64 tag = 0; uvec = u64_to_user_ptr(user_data); - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; @@ -319,7 +319,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(up->offset + done, ctx->buf_table.nr); io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; - if (ctx->compat) + if (io_is_compat(ctx)) user_data += sizeof(struct compat_iovec); else user_data += sizeof(struct iovec); @@ -883,12 +883,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, if (arg) { uvec = (struct iovec __user *) arg; - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; } - if (ctx->compat) + if (io_is_compat(ctx)) arg += sizeof(struct compat_iovec); else arg += sizeof(struct iovec); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 7cbcb82aedfb..143de8e990eb 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -121,7 +121,7 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) return ret; tctx = current->io_uring; - if (ctx->iowq_limits_set) { + if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) { unsigned int limits[2] = { ctx->iowq_limits[0], ctx->iowq_limits[1], }; diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 8eddf8add7a2..579fdddac71a 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -566,8 +566,8 @@ static int __io_timeout_prep(struct io_kiocb *req, INIT_LIST_HEAD(&timeout->list); timeout->off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; + if (unlikely(off && !(req->ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED))) + req->ctx->int_flags |= IO_RING_F_OFF_TIMEOUT_USED; /* * for multishot reqs w/ fixed nr of repeats, repeats tracks the * remaining nr diff --git a/io_uring/tw.c b/io_uring/tw.c index 2f2b4ac4b126..022fe8753c19 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -222,7 +222,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (!head) { io_ctx_mark_taskrun(ctx); - if (ctx->has_evfd) + if (ctx->int_flags & IO_RING_F_HAS_EVFD) io_eventfd_signal(ctx, false); } From 8c557449199e8267bc969ae7e1d70b343b6a646d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 14 Mar 2026 08:46:17 -0600 Subject: [PATCH 14/58] io_uring: mark known and harmless racy ctx->int_flags uses There are a few of these, where flags are read outside of the uring_lock, yet it's harmless to race on them. 
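To illustrate the motivation behind the two patches above (an illustration, not code from the series): adjacent C bitfields share one storage word, so concurrent writers of different one-bit fields perform racing read-modify-writes of the same word, and a single bitfield read cannot be annotated for KCSAN. With discrete bits in a plain word, harmless unlocked readers can be marked explicitly:

#include <linux/compiler.h>
#include <linux/types.h>

struct ctx_bitfields {
	unsigned int drain_active : 1;	/* shares a word with has_evfd, so    */
	unsigned int has_evfd : 1;	/* setting one rewrites the other too */
};

enum {
	F_DRAIN_ACTIVE	= 1U << 0,
	F_HAS_EVFD	= 1U << 1,
};

struct ctx_flags {
	unsigned int int_flags;		/* written only under the ring lock */
};

static bool has_evfd_lockless(struct ctx_flags *ctx)
{
	/* known-benign unlocked read, which data_race() can express */
	return data_race(ctx->int_flags) & F_HAS_EVFD;
}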
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 7 ++++--- io_uring/tw.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bfeb3bc3849d..fb5a263706be 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2242,7 +2242,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - if (unlikely(!(ctx->int_flags & IO_RING_F_POLL_ACTIVATED))) + if (unlikely(!(data_race(ctx->int_flags) & IO_RING_F_POLL_ACTIVATED))) io_activate_pollwq(ctx); /* * provides mb() which pairs with barrier from wq_has_sleeper diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 5cb1983043cd..91cf67b5d85b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -470,11 +470,12 @@ static inline void io_req_complete_defer(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } +#define SHOULD_FLUSH_MASK (IO_RING_F_OFF_TIMEOUT_USED | \ + IO_RING_F_HAS_EVFD | IO_RING_F_POLL_ACTIVATED) + static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->int_flags & (IO_RING_F_OFF_TIMEOUT_USED | - IO_RING_F_HAS_EVFD | - IO_RING_F_POLL_ACTIVATED))) + if (unlikely(data_race(ctx->int_flags) & SHOULD_FLUSH_MASK)) __io_commit_cqring_flush(ctx); } diff --git a/io_uring/tw.c b/io_uring/tw.c index 022fe8753c19..fdff81eebc95 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -222,7 +222,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (!head) { io_ctx_mark_taskrun(ctx); - if (ctx->int_flags & IO_RING_F_HAS_EVFD) + if (data_race(ctx->int_flags) & IO_RING_F_HAS_EVFD) io_eventfd_signal(ctx, false); } From 9165dc4fa969b64c2d4396ee4e1546a719978dd1 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 2 Mar 2026 10:29:10 -0700 Subject: [PATCH 15/58] io_uring: add REQ_F_IOPOLL A subsequent commit will allow uring_cmds to files that don't implement ->uring_cmd_iopoll() to be issued to IORING_SETUP_IOPOLL io_urings. This means the ctx's IORING_SETUP_IOPOLL flag isn't sufficient to determine whether a given request needs to be iopolled. Introduce a request flag REQ_F_IOPOLL set in ->issue() if a request needs to be iopolled to completion. Set the flag in io_rw_init_file() and io_uring_cmd() for requests issued to IORING_SETUP_IOPOLL ctxs. Use the request flag instead of IORING_SETUP_IOPOLL in places dealing with a specific request. A future possibility would be to add an option to enable/disable iopoll in the io_uring SQE instead of determining it from IORING_SETUP_IOPOLL. 
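For context, the userspace setup that makes a request IOPOLL-eligible in the first place is unchanged: the ring is created with IORING_SETUP_IOPOLL and, for reads and writes, the file is opened with O_DIRECT. A generic sketch, with O_DIRECT buffer alignment and error handling omitted:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <liburing.h>

static int iopoll_read(const char *path, void *buf, unsigned int len)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd, ret;

	io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL);
	fd = open(path, O_RDONLY | O_DIRECT);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	io_uring_submit(&ring);

	/* with IOPOLL the wait actively polls the device for completion */
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		ret = cqe->res;
		io_uring_cqe_seen(&ring, cqe);
	}
	close(fd);
	io_uring_queue_exit(&ring);
	return ret;
}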
Signed-off-by: Caleb Sander Mateos Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Link: https://patch.msgid.link/20260302172914.2488599-2-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ io_uring/io_uring.c | 9 ++++----- io_uring/rw.c | 11 ++++++----- io_uring/uring_cmd.c | 5 +++-- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0b3f08adc217..4dbd7083dd54 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -550,6 +550,7 @@ enum { REQ_F_HAS_METADATA_BIT, REQ_F_IMPORT_BUFFER_BIT, REQ_F_SQE_COPIED_BIT, + REQ_F_IOPOLL_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -641,6 +642,8 @@ enum { REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT), /* ->sqe_copy() has been called, if necessary */ REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), + /* request must be iopolled to completion (set in ->issue()) */ + REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT), }; struct io_tw_req { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fb5a263706be..a610eaa5fd7c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -356,7 +356,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { req->flags |= REQ_F_CREDS; @@ -378,7 +377,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (should_hash && (req->file->f_flags & O_DIRECT) && (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) should_hash = false; - if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) + if (should_hash || (req->flags & REQ_F_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) @@ -1419,7 +1418,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = 0; /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) + if ((req->flags & REQ_F_IOPOLL) && def->iopoll_queue) io_iopoll_req_issued(req, issue_flags); } return ret; @@ -1435,7 +1434,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) io_tw_lock(req->ctx, tw); WARN_ON_ONCE(!req->file); - if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL)) return -EFAULT; ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); @@ -1533,7 +1532,7 @@ void io_wq_submit_work(struct io_wq_work *work) * wait for request slots on the block side. */ if (!needs_poll) { - if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (!(req->flags & REQ_F_IOPOLL)) break; if (io_wq_worker_stopped()) break; diff --git a/io_uring/rw.c b/io_uring/rw.c index 1a5f262734e8..3bdb9914e673 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -504,7 +504,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req) if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) + !(req->flags & REQ_F_IOPOLL))) return false; /* * If ref is dying, we might be running poll reap from the exit work. 
@@ -640,7 +640,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) } } - if (req->ctx->flags & IORING_SETUP_IOPOLL) + if (req->flags & REQ_F_IOPOLL) io_complete_rw_iopoll(&rw->kiocb, ret); else io_complete_rw(&rw->kiocb, ret); @@ -654,7 +654,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) { u32 cflags = 0; __io_complete_rw_common(req, ret); @@ -876,6 +876,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; + req->flags |= REQ_F_IOPOLL; kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; req->iopoll_completed = 0; @@ -963,7 +964,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (!force_nonblock && !(req->flags & REQ_F_IOPOLL)) goto done; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) @@ -1188,7 +1189,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) + if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL)) goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index ee7b49f47cb5..b651c63f6e20 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -110,7 +110,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, * because iopoll completion data overlaps with the hash_node used * for tracking. */ - if (ctx->flags & IORING_SETUP_IOPOLL) + if (req->flags & REQ_F_IOPOLL) return; if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { @@ -167,7 +167,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, io_req_set_cqe32_extra(req, res2, 0); } io_req_uring_cleanup(req, issue_flags); - if (req->ctx->flags & IORING_SETUP_IOPOLL) { + if (req->flags & REQ_F_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { @@ -260,6 +260,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!file->f_op->uring_cmd_iopoll) return -EOPNOTSUPP; + req->flags |= REQ_F_IOPOLL; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { From 7995be40deb3ab8b5df7bdf0621f33aa546aefa7 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 2 Mar 2026 10:29:11 -0700 Subject: [PATCH 16/58] io_uring: remove iopoll_queue from struct io_issue_def The opcode iopoll_queue flag is now redundant with REQ_F_IOPOLL. Only io_{read,write}{,_fixed}() and io_uring_cmd() set the REQ_F_IOPOLL flag, and the opcodes with these ->issue() implementations are precisely the ones that set iopoll_queue. So don't bother checking the iopoll_queue flag in io_issue_sqe(). Remove the unused flag from struct io_issue_def. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Link: https://patch.msgid.link/20260302172914.2488599-3-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +-- io_uring/opdef.c | 10 ---------- io_uring/opdef.h | 2 -- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a610eaa5fd7c..64ba359878a1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1417,8 +1417,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (ret == IOU_ISSUE_SKIP_COMPLETE) { ret = 0; - /* If the op doesn't have a file, we're not polling for it */ - if ((req->flags & REQ_F_IOPOLL) && def->iopoll_queue) + if (req->flags & REQ_F_IOPOLL) io_iopoll_req_issued(req, issue_flags); } return ret; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 645980fa4651..c3ef52b70811 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -67,7 +67,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_readv, @@ -82,7 +81,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_writev, @@ -102,7 +100,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read_fixed, .issue = io_read_fixed, @@ -116,7 +113,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write_fixed, .issue = io_write_fixed, @@ -250,7 +246,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read, .issue = io_read, @@ -264,7 +259,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write, .issue = io_write, @@ -423,7 +417,6 @@ const struct io_issue_def io_issue_defs[] = { .needs_file = 1, .plug = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, @@ -556,7 +549,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_readv_fixed, @@ -571,7 +563,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_writev_fixed, @@ -593,7 +584,6 @@ const struct io_issue_def io_issue_defs[] = { .needs_file = 1, .plug = 1, .iopoll = 1, - .iopoll_queue = 1, .is_128 = 1, .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, diff --git a/io_uring/opdef.h b/io_uring/opdef.h index faf3955dce8b..667f981e63b0 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -25,8 +25,6 @@ struct io_issue_def { unsigned poll_exclusive : 1; /* skip auditing */ unsigned audit_skip : 1; - /* have to be put into the iopoll list */ - unsigned iopoll_queue : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ unsigned vectored : 1; /* set to 1 if this 
opcode uses 128b sqes in a mixed sq */ From 3a5e96d47f7ea37fb6adf37882eec1521f8ca75e Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 2 Mar 2026 10:29:12 -0700 Subject: [PATCH 17/58] io_uring: count CQEs in io_iopoll_check() A subsequent commit will allow uring_cmds that don't use iopoll on IORING_SETUP_IOPOLL io_urings. As a result, CQEs can be posted without setting the iopoll_completed flag for a request in iopoll_list or going through task work. For example, a UBLK_U_IO_FETCH_IO_CMDS command could call io_uring_mshot_cmd_post_cqe() to directly post a CQE. The io_iopoll_check() loop currently only counts completions posted in io_do_iopoll() when determining whether the min_events threshold has been met. It also exits early if there are any existing CQEs before polling, or if any CQEs are posted while running task work. CQEs posted via io_uring_mshot_cmd_post_cqe() or other mechanisms won't be counted against min_events. Explicitly check the available CQEs in each io_iopoll_check() loop iteration to account for CQEs posted in any fashion. Signed-off-by: Caleb Sander Mateos Link: https://patch.msgid.link/20260302172914.2488599-4-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 64ba359878a1..74cd62b44d94 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1186,7 +1186,6 @@ __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) { - unsigned int nr_events = 0; unsigned long check_cq; min_events = min(min_events, ctx->cq_entries); @@ -1229,8 +1228,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) * very same mutex. */ if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) { - u32 tail = ctx->cached_cq_tail; - (void) io_run_local_work_locked(ctx, min_events); if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) { @@ -1239,7 +1236,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) mutex_lock(&ctx->uring_lock); } /* some requests don't go through iopoll_list */ - if (tail != ctx->cached_cq_tail || list_empty(&ctx->iopoll_list)) + if (list_empty(&ctx->iopoll_list)) break; } ret = io_do_iopoll(ctx, !min_events); @@ -1250,9 +1247,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) return -EINTR; if (need_resched()) break; - - nr_events += ret; - } while (nr_events < min_events); + } while (io_cqring_events(ctx) < min_events); return 0; } From 23475637b0c47e5028817c9fd4dabe8f7409ca6c Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 2 Mar 2026 10:29:13 -0700 Subject: [PATCH 18/58] io_uring/uring_cmd: allow non-iopoll cmds with IORING_SETUP_IOPOLL Currently, creating an io_uring with IORING_SETUP_IOPOLL requires all requests issued to it to support iopoll. This prevents, for example, using ublk zero-copy together with IORING_SETUP_IOPOLL, as ublk zero-copy buffer registrations are performed using a uring_cmd. There's no technical reason why these non-iopoll uring_cmds can't be supported. They will either complete synchronously or via an external mechanism that calls io_uring_cmd_done(), io_uring_cmd_post_mshot_cqe32(), or io_uring_mshot_cmd_post_cqe(), so they don't need to be polled. Allow uring_cmd requests to be issued to IORING_SETUP_IOPOLL io_urings even if their files don't implement ->uring_cmd_iopoll(). 
For these uring_cmd requests, skip initializing struct io_kiocb's iopoll fields, don't set REQ_F_IOPOLL, and don't set IO_URING_F_IOPOLL in issue_flags. Signed-off-by: Caleb Sander Mateos Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Link: https://patch.msgid.link/20260302172914.2488599-5-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index b651c63f6e20..7b25dcd9d05f 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -257,9 +257,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) issue_flags |= IO_URING_F_CQE32; if (io_is_compat(ctx)) issue_flags |= IO_URING_F_COMPAT; - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!file->f_op->uring_cmd_iopoll) - return -EOPNOTSUPP; + if (ctx->flags & IORING_SETUP_IOPOLL && file->f_op->uring_cmd_iopoll) { req->flags |= REQ_F_IOPOLL; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; From f144dbac4b177cfd026e417ab98da518ff3372cb Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 2 Mar 2026 10:29:14 -0700 Subject: [PATCH 19/58] nvme: remove nvme_dev_uring_cmd() IO_URING_F_IOPOLL check nvme_dev_uring_cmd() is part of struct file_operations nvme_dev_fops, which doesn't implement ->uring_cmd_iopoll(). So it won't be called with issue_flags that include IO_URING_F_IOPOLL. Drop the unnecessary IO_URING_F_IOPOLL check in nvme_dev_uring_cmd(). Signed-off-by: Caleb Sander Mateos Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Link: https://patch.msgid.link/20260302172914.2488599-6-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 8844bbd39515..9597a87cf05d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -786,10 +786,6 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) struct nvme_ctrl *ctrl = ioucmd->file->private_data; int ret; - /* IOPOLL not supported yet */ - if (issue_flags & IO_URING_F_IOPOLL) - return -EOPNOTSUPP; - ret = nvme_uring_cmd_checks(issue_flags); if (ret) return ret; From 033af2b3eb19c5ed96825572105bca3611635ada Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:38 +0000 Subject: [PATCH 20/58] io_uring: introduce callback driven main loop io_uring_enter() has a fixed order of execution: it submits requests, waits for completions, and returns to the user. Allow optionally replacing it with a custom loop driven by a callback called loop_step. The basic requirement for the callback is that it should be able to submit requests, wait for completions, parse them, and repeat. Most of the communication, including parameter passing, can be implemented via shared memory. The callback should return IOU_LOOP_CONTINUE to continue execution or IOU_LOOP_STOP to return to user space. Note that the kernel may decide to prematurely terminate it as well, e.g. in case the process was signalled or killed. The hook takes a structure with parameters. It can be used to ask the kernel to wait for CQEs by setting cq_wait_idx to the CQE index it wants to wait for. Spurious wake ups are possible and even likely; the callback is expected to handle them. There will be more parameters in the future, such as a timeout.
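As a rough sketch of that contract, a hypothetical in-kernel loop_step implementation could be shaped like the following; only iou_loop_params, cq_wait_idx and the IOU_LOOP_* return codes come from this patch, the example_*() helpers are placeholders:

    static int example_loop_step(struct io_ring_ctx *ctx,
                                 struct iou_loop_params *lp)
    {
            /* submit whatever is currently queued in the SQ (placeholder helper) */
            example_submit_pending(ctx);

            /* ask the kernel to wait until the CQ tail reaches this index */
            lp->cq_wait_idx = example_next_wait_target(ctx);

            /* wake ups may be spurious, so progress is re-checked each iteration */
            if (example_all_work_done(ctx))
                    return IOU_LOOP_STOP;           /* return to user space */
            return IOU_LOOP_CONTINUE;               /* run another iteration */
    }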
It can be used with kernel callbacks, for example, as a slow path deprecation mechanism overwriting SQEs and emulating the wanted behaviour; however, it's more useful together with BPF programs implemented in the following patches. Note that keeping it separate from the normal io_uring wait loop makes things much simpler and cleaner. It keeps it in one place instead of spreading a bunch of checks across different places, including disabling the submission path. It holds the lock by default, which is a better fit for BPF synchronisation and the loop execution model. It nicely avoids existing quirks like forced wake ups on timeout request completion. And it should make it easier to implement new features. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/a2d369aa1c9dd23ad7edac9220cffc563abcaed6.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ++ io_uring/Makefile | 2 +- io_uring/io_uring.c | 11 ++++ io_uring/loop.c | 91 ++++++++++++++++++++++++++++++++++ io_uring/loop.h | 27 ++++ io_uring/wait.h | 1 + 6 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 io_uring/loop.c create mode 100644 io_uring/loop.h diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4dbd7083dd54..344b634b8989 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -41,6 +41,8 @@ enum io_uring_cmd_flags { IO_URING_F_COMPAT = (1 << 12), }; +struct iou_loop_params; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -361,6 +363,9 @@ struct io_ring_ctx { struct io_alloc_cache rw_cache; struct io_alloc_cache cmd_cache; + int (*loop_step)(struct io_ring_ctx *ctx, + struct iou_loop_params *); + /* * Any cancelable uring_cmd is added to this list in * ->uring_cmd() by io_uring_cmd_insert_cancelable() diff --git a/io_uring/Makefile b/io_uring/Makefile index 931f9156132a..1c1f47de32a4 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ advise.o openclose.o statx.o timeout.o \ cancel.o waitid.o register.o \ truncate.o memmap.o alloc_cache.o \ - query.o + query.o loop.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 74cd62b44d94..960d36c49ffe 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "eventfd.h" #include "wait.h" #include "bpf_filter.h" +#include "loop.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -588,6 +589,11 @@ void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx) +{ + __io_cqring_overflow_flush(ctx, false); +} + /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct io_kiocb *req) { @@ -2571,6 +2577,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED)) goto out; + if (io_has_loop_ops(ctx)) { + ret = io_run_loop(ctx); + goto out; + } + /* * For SQ polling, the thread will do all submissions and completions.
* Just return the requested submit count, and wake the thread if diff --git a/io_uring/loop.c b/io_uring/loop.c new file mode 100644 index 000000000000..31843cc3e451 --- /dev/null +++ b/io_uring/loop.c @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include "io_uring.h" +#include "wait.h" +#include "loop.h" + +static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx, + const struct iou_loop_params *lp) +{ + return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail); +} + +static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait) +{ + atomic_set(&ctx->cq_wait_nr, nr_wait); + set_current_state(TASK_INTERRUPTIBLE); +} + +static inline void io_loop_wait_finish(struct io_ring_ctx *ctx) +{ + __set_current_state(TASK_RUNNING); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); +} + +static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp, + unsigned nr_wait) +{ + io_loop_wait_start(ctx, nr_wait); + + if (unlikely(io_local_work_pending(ctx) || + io_loop_nr_cqes(ctx, lp) <= 0) || + READ_ONCE(ctx->check_cq)) { + io_loop_wait_finish(ctx); + return; + } + + mutex_unlock(&ctx->uring_lock); + schedule(); + io_loop_wait_finish(ctx); + mutex_lock(&ctx->uring_lock); +} + +static int __io_run_loop(struct io_ring_ctx *ctx) +{ + struct iou_loop_params lp = {}; + + while (true) { + int nr_wait, step_res; + + if (unlikely(!ctx->loop_step)) + return -EFAULT; + + step_res = ctx->loop_step(ctx, &lp); + if (step_res == IOU_LOOP_STOP) + break; + if (step_res != IOU_LOOP_CONTINUE) + return -EINVAL; + + nr_wait = io_loop_nr_cqes(ctx, &lp); + if (nr_wait > 0) + io_loop_wait(ctx, &lp, nr_wait); + else + nr_wait = 0; + + if (task_work_pending(current)) { + mutex_unlock(&ctx->uring_lock); + io_run_task_work(); + mutex_lock(&ctx->uring_lock); + } + if (unlikely(task_sigpending(current))) + return -EINTR; + io_run_local_work_locked(ctx, nr_wait); + + if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + io_cqring_overflow_flush_locked(ctx); + } + + return 0; +} + +int io_run_loop(struct io_ring_ctx *ctx) +{ + int ret; + + if (!io_allowed_run_tw(ctx)) + return -EEXIST; + + mutex_lock(&ctx->uring_lock); + ret = __io_run_loop(ctx); + mutex_unlock(&ctx->uring_lock); + return ret; +} diff --git a/io_uring/loop.h b/io_uring/loop.h new file mode 100644 index 000000000000..d7718b9ce61e --- /dev/null +++ b/io_uring/loop.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_LOOP_H +#define IOU_LOOP_H + +#include + +struct iou_loop_params { + /* + * The CQE index to wait for. Only serves as a hint and can still be + * woken up earlier. 
+ */ + __u32 cq_wait_idx; +}; + +enum { + IOU_LOOP_CONTINUE = 0, + IOU_LOOP_STOP, +}; + +static inline bool io_has_loop_ops(struct io_ring_ctx *ctx) +{ + return data_race(ctx->loop_step); +} + +int io_run_loop(struct io_ring_ctx *ctx); + +#endif diff --git a/io_uring/wait.h b/io_uring/wait.h index 5e236f74e1af..037e512dd80c 100644 --- a/io_uring/wait.h +++ b/io_uring/wait.h @@ -25,6 +25,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, struct ext_arg *ext_arg); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx); +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx); static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { From d0e437b76bd3c979ddaa6205f5e9ad3e0f95faef Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:39 +0000 Subject: [PATCH 21/58] io_uring/bpf-ops: implement loop_step with BPF struct_ops Introduce io_uring BPF struct ops implementing the loop_step callback, which will allow BPF to overwrite the default io_uring event loop logic. The callback takes an io_uring context, the main role of which is to be passed to io_uring kfuncs. The other argument is a struct iou_loop_params, which BPF can use to request CQ waiting and communicate other parameters. See the event loop description in the previous patch for more details. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/98db437651ce64e9cbeb611c60bf5887259db09f.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/Kconfig | 5 ++ io_uring/Makefile | 1 + io_uring/bpf-ops.c | 127 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/bpf-ops.h | 14 +++++ io_uring/io_uring.c | 1 + 5 files changed, 148 insertions(+) create mode 100644 io_uring/bpf-ops.c create mode 100644 io_uring/bpf-ops.h diff --git a/io_uring/Kconfig b/io_uring/Kconfig index a7ae23cf1035..a283d9e53787 100644 --- a/io_uring/Kconfig +++ b/io_uring/Kconfig @@ -14,3 +14,8 @@ config IO_URING_BPF def_bool y depends on BPF depends on NET + +config IO_URING_BPF_OPS + def_bool y + depends on IO_URING + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF diff --git a/io_uring/Makefile b/io_uring/Makefile index 1c1f47de32a4..c54e328d1410 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_NET) += net.o cmd_net.o obj-$(CONFIG_PROC_FS) += fdinfo.o obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o +obj-$(CONFIG_IO_URING_BPF_OPS) += bpf-ops.o diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c new file mode 100644 index 000000000000..975db5a78188 --- /dev/null +++ b/io_uring/bpf-ops.c @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include + +#include "io_uring.h" +#include "register.h" +#include "bpf-ops.h" +#include "loop.h" + +static const struct btf_type *loop_params_type; + +static int io_bpf_ops__loop_step(struct io_ring_ctx *ctx, + struct iou_loop_params *lp) +{ + return IOU_LOOP_STOP; +} + +static struct io_uring_bpf_ops io_bpf_ops_stubs = { + .loop_step = io_bpf_ops__loop_step, +}; + +static bool bpf_io_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type != BPF_READ) + return false; + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (off % size != 0) + return false; + + return btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_io_btf_struct_access(struct 
bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) +{ + const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); + + if (t == loop_params_type) { + if (off + size <= offsetofend(struct iou_loop_params, cq_wait_idx)) + return SCALAR_VALUE; + } + + return -EACCES; +} + +static const struct bpf_verifier_ops bpf_io_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_io_is_valid_access, + .btf_struct_access = bpf_io_btf_struct_access, +}; + +static const struct btf_type * +io_lookup_struct_type(struct btf *btf, const char *name) +{ + s32 type_id; + + type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT); + if (type_id < 0) + return NULL; + return btf_type_by_id(btf, type_id); +} + +static int bpf_io_init(struct btf *btf) +{ + loop_params_type = io_lookup_struct_type(btf, "iou_loop_params"); + if (!loop_params_type) { + pr_err("io_uring: Failed to locate iou_loop_params\n"); + return -EINVAL; + } + + return 0; +} + +static int bpf_io_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + return 0; +} + +static int bpf_io_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static int bpf_io_reg(void *kdata, struct bpf_link *link) +{ + return -EOPNOTSUPP; +} + +static void bpf_io_unreg(void *kdata, struct bpf_link *link) +{ +} + +static struct bpf_struct_ops bpf_ring_ops = { + .verifier_ops = &bpf_io_verifier_ops, + .reg = bpf_io_reg, + .unreg = bpf_io_unreg, + .check_member = bpf_io_check_member, + .init_member = bpf_io_init_member, + .init = bpf_io_init, + .cfi_stubs = &io_bpf_ops_stubs, + .name = "io_uring_bpf_ops", + .owner = THIS_MODULE, +}; + +static int __init io_uring_bpf_init(void) +{ + int ret; + + ret = register_bpf_struct_ops(&bpf_ring_ops, io_uring_bpf_ops); + if (ret) { + pr_err("io_uring: Failed to register struct_ops (%d)\n", ret); + return ret; + } + + return 0; +} +__initcall(io_uring_bpf_init); diff --git a/io_uring/bpf-ops.h b/io_uring/bpf-ops.h new file mode 100644 index 000000000000..e8a08ae2df0a --- /dev/null +++ b/io_uring/bpf-ops.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_BPF_OPS_H +#define IOU_BPF_OPS_H + +#include + +struct io_uring_bpf_ops { + int (*loop_step)(struct io_ring_ctx *ctx, struct iou_loop_params *lp); + + __u32 ring_fd; + void *priv; +}; + +#endif /* IOU_BPF_OPS_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 960d36c49ffe..0a80c8e6e633 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -87,6 +87,7 @@ #include "msg_ring.h" #include "memmap.h" #include "zcrx.h" +#include "bpf-ops.h" #include "timeout.h" #include "poll.h" From 890819248a8616558fe12e6c06c918ee1c3a2bc6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:40 +0000 Subject: [PATCH 22/58] io_uring/bpf-ops: add kfunc helpers Add two kfuncs that should cover most of the needs: 1. bpf_io_uring_submit_sqes(), which allows to submit io_uring requests. It mirrors the normal user space submission path and follows all related io_uring_enter(2) rules. i.e. SQEs are taken from the SQ according to head/tail values. In case of IORING_SETUP_SQ_REWIND, it'll submit first N entries. 2. bpf_io_uring_get_region() returns a pointer to the specified region, where io_uring regions are kernel-userspace shared chunks of memory. It takes the size as an argument, which should be a load time constant. 
There are 3 types of regions: - IOU_REGION_SQ returns the submission queue. - IOU_REGION_CQ stores the CQ, SQ/CQ headers and the sqarray. In other words, it gives same memory that would normally be mmap'ed with IORING_FEAT_SINGLE_MMAP enabled IORING_OFF_SQ_RING. - IOU_REGION_MEM represents the memory / parameter region. It can be used to store request indirect parameters and for kernel - user communication. It intentionally provides a thin but flexible API and expects BPF programs to implement CQ/SQ header parsing, CQ walking, etc. That mirrors how the normal user space works with rings and should help to minimise kernel / kfunc helpers changes while introducing new generic io_uring features. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/967bcc10e94c796eb273998621551b2a21848cde.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/bpf-ops.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ io_uring/bpf-ops.h | 6 +++++ 2 files changed, 61 insertions(+) diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c index 975db5a78188..17518f4ecca9 100644 --- a/io_uring/bpf-ops.c +++ b/io_uring/bpf-ops.c @@ -5,11 +5,58 @@ #include "io_uring.h" #include "register.h" +#include "memmap.h" #include "bpf-ops.h" #include "loop.h" static const struct btf_type *loop_params_type; +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx, u32 nr) +{ + return io_submit_sqes(ctx, nr); +} + +__bpf_kfunc +__u8 *bpf_io_uring_get_region(struct io_ring_ctx *ctx, __u32 region_id, + const size_t rdwr_buf_size) +{ + struct io_mapped_region *r; + + lockdep_assert_held(&ctx->uring_lock); + + switch (region_id) { + case IOU_REGION_MEM: + r = &ctx->param_region; + break; + case IOU_REGION_CQ: + r = &ctx->ring_region; + break; + case IOU_REGION_SQ: + r = &ctx->sq_region; + break; + default: + return NULL; + } + + if (unlikely(rdwr_buf_size > io_region_size(r))) + return NULL; + return io_region_get_ptr(r); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(io_uring_kfunc_set) +BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE); +BTF_ID_FLAGS(func, bpf_io_uring_get_region, KF_RET_NULL); +BTF_KFUNCS_END(io_uring_kfunc_set) + +static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = { + .owner = THIS_MODULE, + .set = &io_uring_kfunc_set, +}; + static int io_bpf_ops__loop_step(struct io_ring_ctx *ctx, struct iou_loop_params *lp) { @@ -68,12 +115,20 @@ io_lookup_struct_type(struct btf *btf, const char *name) static int bpf_io_init(struct btf *btf) { + int ret; + loop_params_type = io_lookup_struct_type(btf, "iou_loop_params"); if (!loop_params_type) { pr_err("io_uring: Failed to locate iou_loop_params\n"); return -EINVAL; } + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &bpf_io_uring_kfunc_set); + if (ret) { + pr_err("io_uring: Failed to register kfuncs (%d)\n", ret); + return ret; + } return 0; } diff --git a/io_uring/bpf-ops.h b/io_uring/bpf-ops.h index e8a08ae2df0a..b9e589ad519a 100644 --- a/io_uring/bpf-ops.h +++ b/io_uring/bpf-ops.h @@ -4,6 +4,12 @@ #include +enum { + IOU_REGION_MEM, + IOU_REGION_CQ, + IOU_REGION_SQ, +}; + struct io_uring_bpf_ops { int (*loop_step)(struct io_ring_ctx *ctx, struct iou_loop_params *lp); From 98f37634b12b17ad5c56db8fb63cf9d7dc55d74c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:41 +0000 Subject: [PATCH 23/58] io_uring/bpf-ops: implement bpf ops registration Implement BPF struct ops registration. 
It's registered off the BPF path, and can be removed by BPF as well as io_uring. To protect it, introduce a global lock synchronising registration. ctx->uring_lock can be nested under it. ctx->bpf_ops is write protected by both locks and so it's safe to read it under either of them. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/1f46bffd76008de49cbafa2ad77d348810a4f69e.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ++ io_uring/bpf-ops.c | 92 +++++++++++++++++++++++++++++++++- io_uring/bpf-ops.h | 8 +++ io_uring/io_uring.c | 1 + 4 files changed, 104 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 344b634b8989..28e5dbdac55b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -8,6 +8,9 @@ #include #include +struct iou_loop_params; +struct io_uring_bpf_ops; + enum { /* * A hint to not wake right away but delay until there are enough of @@ -488,6 +491,8 @@ struct io_ring_ctx { DECLARE_HASHTABLE(napi_ht, 4); #endif + struct io_uring_bpf_ops *bpf_ops; + /* * Protection for resize vs mmap races - both the mmap and resize * side will need to grab this lock, to prevent either side from diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c index 17518f4ecca9..e4b244337aa9 100644 --- a/io_uring/bpf-ops.c +++ b/io_uring/bpf-ops.c @@ -5,10 +5,11 @@ #include "io_uring.h" #include "register.h" +#include "loop.h" #include "memmap.h" #include "bpf-ops.h" -#include "loop.h" +static DEFINE_MUTEX(io_bpf_ctrl_mutex); static const struct btf_type *loop_params_type; __bpf_kfunc_start_defs(); @@ -143,16 +144,103 @@ static int bpf_io_init_member(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata) { + u32 moff = __btf_member_bit_offset(t, member) / 8; + const struct io_uring_bpf_ops *uops = udata; + struct io_uring_bpf_ops *ops = kdata; + + switch (moff) { + case offsetof(struct io_uring_bpf_ops, ring_fd): + ops->ring_fd = uops->ring_fd; + return 1; + } + return 0; +} + +static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops) +{ + if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL)) + return -EOPNOTSUPP; + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EOPNOTSUPP; + + if (ctx->bpf_ops) + return -EBUSY; + if (WARN_ON_ONCE(!ops->loop_step)) + return -EINVAL; + + ops->priv = ctx; + ctx->bpf_ops = ops; + ctx->loop_step = ops->loop_step; return 0; } static int bpf_io_reg(void *kdata, struct bpf_link *link) { - return -EOPNOTSUPP; + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + struct file *file; + int ret = -EBUSY; + + file = io_uring_register_get_file(ops->ring_fd, false); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + scoped_guard(mutex, &io_bpf_ctrl_mutex) { + guard(mutex)(&ctx->uring_lock); + ret = io_install_bpf(ctx, ops); + } + + fput(file); + return ret; +} + +static void io_eject_bpf(struct io_ring_ctx *ctx) +{ + struct io_uring_bpf_ops *ops = ctx->bpf_ops; + + if (WARN_ON_ONCE(!ops)) + return; + if (WARN_ON_ONCE(ops->priv != ctx)) + return; + + ops->priv = NULL; + ctx->bpf_ops = NULL; + ctx->loop_step = NULL; } static void bpf_io_unreg(void *kdata, struct bpf_link *link) { + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + + guard(mutex)(&io_bpf_ctrl_mutex); + ctx = ops->priv; + if (ctx) { + guard(mutex)(&ctx->uring_lock); + if (WARN_ON_ONCE(ctx->bpf_ops != ops)) + return; + + io_eject_bpf(ctx); + } 
+} + +void io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ + /* + * ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock, + * and read protected by either. Try to avoid taking the global lock + * for rings that never had any bpf installed. + */ + scoped_guard(mutex, &ctx->uring_lock) { + if (!ctx->bpf_ops) + return; + } + + guard(mutex)(&io_bpf_ctrl_mutex); + guard(mutex)(&ctx->uring_lock); + if (ctx->bpf_ops) + io_eject_bpf(ctx); } static struct bpf_struct_ops bpf_ring_ops = { diff --git a/io_uring/bpf-ops.h b/io_uring/bpf-ops.h index b9e589ad519a..b39b3fd3acda 100644 --- a/io_uring/bpf-ops.h +++ b/io_uring/bpf-ops.h @@ -17,4 +17,12 @@ struct io_uring_bpf_ops { void *priv; }; +#ifdef CONFIG_IO_URING_BPF_OPS +void io_unregister_bpf_ops(struct io_ring_ctx *ctx); +#else +static inline void io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ +} +#endif + #endif /* IOU_BPF_OPS_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0a80c8e6e633..d703f0a8b315 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2148,6 +2148,7 @@ static __cold void io_req_caches_free(struct io_ring_ctx *ctx) static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { + io_unregister_bpf_ops(ctx); io_sq_thread_finish(ctx); mutex_lock(&ctx->uring_lock); From 49c21d9a5fcd83b717f2f543734ca15e36d0189e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Mar 2026 14:03:54 -0600 Subject: [PATCH 24/58] io_uring/kbuf: use 'ctx' consistently There's already a local ctx variable, yet the ring lock and unlock helpers use req->ctx. use ctx consistently. Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 26813b0f1dfd..ff81f32d8032 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -225,7 +225,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, struct io_br_sel sel = { }; struct io_buffer_list *bl; - io_ring_submit_lock(req->ctx, issue_flags); + io_ring_submit_lock(ctx, issue_flags); bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { @@ -234,7 +234,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, else sel.addr = io_provided_buffer_select(req, len, bl); } - io_ring_submit_unlock(req->ctx, issue_flags); + io_ring_submit_unlock(ctx, issue_flags); return sel; } From 74dbc0bab57b7e5b4adbc93ce9179e0f36079e4c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Mar 2026 14:10:19 -0600 Subject: [PATCH 25/58] io_uring/poll: cache req->apoll_events Avoid a potential reload of ->apoll_events post vfs_poll() by caching it in a local variable. Signed-off-by: Jens Axboe --- io_uring/poll.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index b671b84657d9..4175e63b9edf 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -276,8 +276,10 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { - struct poll_table_struct pt = { ._key = req->apoll_events }; - req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; + __poll_t events = req->apoll_events; + struct poll_table_struct pt = { ._key = events }; + + req->cqe.res = vfs_poll(req->file, &pt) & events; /* * We got woken with a mask, but someone else got to * it first. 
The above vfs_poll() doesn't add us back @@ -286,7 +288,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) */ if (unlikely(!req->cqe.res)) { /* Multishot armed need not reissue */ - if (!(req->apoll_events & EPOLLONESHOT)) + if (!(events & EPOLLONESHOT)) continue; return IOU_POLL_REISSUE; } From 0a6b9ae1f314c92141b851fcbc2f7b4d0cd2e340 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Mar 2026 14:18:04 -0600 Subject: [PATCH 26/58] io_uring/net: use 'ctx' consistently There's already a local ctx variable, use it for the io_is_compat() check as well. Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 3f9d08b78c21..b3f73883a24c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1375,7 +1375,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; - if (io_is_compat(req->ctx)) + if (io_is_compat(ctx)) zc->msg_flags |= MSG_CMSG_COMPAT; iomsg = io_msg_alloc_async(req); From 3e97c2582f8450117dfa14cc672437afb31233a0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Mar 2026 14:21:58 -0600 Subject: [PATCH 27/58] io_uring/rw: use cached file rather than req->file In io_rw_init_file(), req->file is cached in file, yet the former is still being used when checking for O_DIRECT. As this is post setting the kiocb flags, the compiler has to reload req->file. Just use the locally cached file instead. Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 3bdb9914e673..046f76a71b9c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -900,7 +900,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) * We have a union of meta fields with wpq used for buffered-io * in io_async_rw, so fail it here. */ - if (!(req->file->f_flags & O_DIRECT)) + if (!(file->f_flags & O_DIRECT)) return -EOPNOTSUPP; kiocb->ki_flags |= IOCB_HAS_METADATA; kiocb->private = &io->meta; From f41b075492355c60d87ddd66371dcdb1ae9c484e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Mar 2026 14:28:44 -0600 Subject: [PATCH 28/58] io_uring: avoid req->ctx reload in io_req_put_rsrc_nodes() Cache 'ctx' to avoid it needing to get potentially reloaded. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d703f0a8b315..6eaa21e09469 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1073,12 +1073,14 @@ void io_queue_next(struct io_kiocb *req) static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + if (req->file_node) { - io_put_rsrc_node(req->ctx, req->file_node); + io_put_rsrc_node(ctx, req->file_node); req->file_node = NULL; } if (req->flags & REQ_F_BUF_NODE) - io_put_rsrc_node(req->ctx, req->buf_node); + io_put_rsrc_node(ctx, req->buf_node); } static void io_free_batch_list(struct io_ring_ctx *ctx, From e5361d25e241ac3a23177fa74ae91d049bad00d3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:50 +0000 Subject: [PATCH 29/58] io_uring/zcrx: return back two step unregistration There are reports where io_uring instance removal takes too long and an ifq reallocation by another zcrx instance fails. 
Split zcrx destruction into two steps similarly how it was before, first close the queue early but maintain zcrx alive, and then when all inflight requests are completed, drop the main zcrx reference. For extra protection, mark terminated zcrx instances in xarray and warn if we double put them. Cc: stable@vger.kernel.org # 6.19+ Link: https://github.com/axboe/liburing/issues/1550 Reported-by: Youngmin Choi Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/0ce21f0565ab4358668922a28a8a36922dfebf76.1774261953.git.asml.silence@gmail.com [axboe: NULL ifq before break inside scoped guard] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++++ io_uring/zcrx.c | 46 ++++++++++++++++++++++++++++++++++++++++++--- io_uring/zcrx.h | 4 ++++ 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6eaa21e09469..34104c256c88 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2308,6 +2308,10 @@ static __cold void io_ring_exit_work(struct work_struct *work) struct io_tctx_node *node; int ret; + mutex_lock(&ctx->uring_lock); + io_terminate_zcrx(ctx); + mutex_unlock(&ctx->uring_lock); + /* * If we're doing polled IO and end up having requests being * submitted async (out-of-line), then completions can come in while diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 73fa82759771..615805d2c3dd 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -624,12 +624,17 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) } } -static void zcrx_unregister(struct io_zcrx_ifq *ifq) +static void zcrx_unregister_user(struct io_zcrx_ifq *ifq) { if (refcount_dec_and_test(&ifq->user_refs)) { io_close_queue(ifq); io_zcrx_scrub(ifq); } +} + +static void zcrx_unregister(struct io_zcrx_ifq *ifq) +{ + zcrx_unregister_user(ifq); io_put_zcrx_ifq(ifq); } @@ -887,6 +892,36 @@ static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) return &area->nia.niovs[niov_idx]; } +static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id) +{ + return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_0); +} + +static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id) +{ + xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_0); +} + +void io_terminate_zcrx(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + unsigned long id = 0; + + lockdep_assert_held(&ctx->uring_lock); + + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (!ifq) + break; + if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id))) + break; + set_zcrx_entry_mark(ctx, id); + id++; + zcrx_unregister_user(ifq); + } +} + void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { struct io_zcrx_ifq *ifq; @@ -898,12 +933,17 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) unsigned long id = 0; ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); - if (ifq) + if (ifq) { + if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) { + ifq = NULL; + break; + } xa_erase(&ctx->zcrx_ctxs, id); + } } if (!ifq) break; - zcrx_unregister(ifq); + io_put_zcrx_ifq(ifq); } xa_destroy(&ctx->zcrx_ctxs); diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 0ddcf0ee8861..0316a41a3561 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -74,6 +74,7 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg); int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); +void io_terminate_zcrx(struct 
io_ring_ctx *ctx); int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct socket *sock, unsigned int flags, unsigned issue_flags, unsigned int *len); @@ -88,6 +89,9 @@ static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { } +static inline void io_terminate_zcrx(struct io_ring_ctx *ctx) +{ +} static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct socket *sock, unsigned int flags, unsigned issue_flags, unsigned int *len) From 41041562a7d6acd5a8ce918be8da7e26337f379f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:51 +0000 Subject: [PATCH 30/58] io_uring/zcrx: fully clean area on error in io_import_umem() When accounting fails, io_import_umem() sets the page array, etc. and returns an error, expecting that the error handling code will take care of the rest. To make the next patch simpler, only return a fully initialised area from the function. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/3a602b7fb347dbd4da6797ac49b52ea5dedb856d.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 615805d2c3dd..2f60193365ce 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -207,22 +207,26 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, 0, (unsigned long)nr_pages << PAGE_SHIFT, GFP_KERNEL_ACCOUNT); - if (ret) { - unpin_user_pages(pages, nr_pages); - kvfree(pages); - return ret; - } + if (ret) + goto out_err; mem->account_pages = io_count_account_pages(pages, nr_pages); ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); - if (ret < 0) + if (ret < 0) { mem->account_pages = 0; + goto out_err; + } mem->sgt = &mem->page_sg_table; mem->pages = pages; mem->nr_folios = nr_pages; mem->size = area_reg->len; return ret; +out_err: + sg_free_table(&mem->page_sg_table); + unpin_user_pages(pages, nr_pages); + kvfree(pages); + return ret; } static void io_release_area_mem(struct io_zcrx_mem *mem) From b8d6eb6c1c80852dfcad8642f346c26aabf34833 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:52 +0000 Subject: [PATCH 31/58] io_uring/zcrx: always dma map in advance zcrx was originally establishing dma mappings at a late stage when it was being bound to a page pool. Dma-buf couldn't work this way, so it's initialised during area creation. It's messy having them done at different spots, so just move everything to area creation time.
Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/334092a2cbdd4aabd7c025050aa99f05ace89bb5.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 44 +++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 2f60193365ce..a6a08ee48b34 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -194,6 +194,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, { struct page **pages; int nr_pages, ret; + bool mapped = false; if (area_reg->dmabuf_fd) return -EINVAL; @@ -210,6 +211,12 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, if (ret) goto out_err; + ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + if (ret < 0) + goto out_err; + mapped = true; + mem->account_pages = io_count_account_pages(pages, nr_pages); ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); if (ret < 0) { @@ -223,6 +230,9 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, mem->size = area_reg->len; return ret; out_err: + if (mapped) + dma_unmap_sgtable(ifq->dev, &mem->page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); sg_free_table(&mem->page_sg_table); unpin_user_pages(pages, nr_pages); kvfree(pages); @@ -288,30 +298,6 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, } } -static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) -{ - int ret; - - guard(mutex)(&ifq->pp_lock); - if (area->is_mapped) - return 0; - - if (!area->mem.is_dmabuf) { - ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, - DMA_FROM_DEVICE, IO_DMA_ATTR); - if (ret < 0) - return ret; - } - - ret = io_populate_area_dma(ifq, area); - if (ret && !area->mem.is_dmabuf) - dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table, - DMA_FROM_DEVICE, IO_DMA_ATTR); - if (ret == 0) - area->is_mapped = true; - return ret; -} - static void io_zcrx_sync_for_device(struct page_pool *pool, struct net_iov *niov) { @@ -464,6 +450,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, ret = io_import_area(ifq, &area->mem, area_reg); if (ret) goto err; + area->is_mapped = true; if (buf_size_shift > io_area_max_shift(&area->mem)) { ret = -ERANGE; @@ -499,6 +486,10 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, niov->type = NET_IOV_IOURING; } + ret = io_populate_area_dma(ifq, area); + if (ret) + goto err; + area->free_count = nr_iovs; /* we're only supporting one area per ifq for now */ area->area_id = 0; @@ -1082,7 +1073,6 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) static int io_pp_zc_init(struct page_pool *pp) { struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); - int ret; if (WARN_ON_ONCE(!ifq)) return -EINVAL; @@ -1095,10 +1085,6 @@ static int io_pp_zc_init(struct page_pool *pp) if (pp->p.dma_dir != DMA_FROM_DEVICE) return -EOPNOTSUPP; - ret = io_zcrx_map_area(ifq, ifq->area); - if (ret) - return ret; - refcount_inc(&ifq->refs); return 0; } From 06fc3b6d388dfa9c3df62830e07be828324b99e3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:53 +0000 Subject: [PATCH 32/58] io_uring/zcrx: extract netdev+area init into a helper In preparation for the following patches, add a function that is responsible for looking up a netdev, creating an area, DMA mapping it, and opening a queue.
Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/88cb6f746ecb496a9030756125419df273d0b003.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 72 +++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index a6a08ee48b34..b0f889b11b73 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -751,10 +751,50 @@ static int import_zcrx(struct io_ring_ctx *ctx, return ret; } +static int zcrx_register_netdev(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_ifq_reg *reg, + struct io_uring_zcrx_area_reg *area) +{ + struct pp_memory_provider_params mp_param = {}; + unsigned if_rxq = reg->if_rxq; + int ret; + + ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, + reg->if_idx); + if (!ifq->netdev) + return -ENODEV; + + netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); + + ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq); + if (!ifq->dev) { + ret = -EOPNOTSUPP; + goto netdev_put_unlock; + } + get_device(ifq->dev); + + ret = io_zcrx_create_area(ifq, area, reg); + if (ret) + goto netdev_put_unlock; + + if (reg->rx_buf_len) + mp_param.rx_page_size = 1U << ifq->niov_shift; + mp_param.mp_ops = &io_uring_pp_zc_ops; + mp_param.mp_priv = ifq; + ret = __net_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL); + if (ret) + goto netdev_put_unlock; + + ifq->if_rxq = if_rxq; + ret = 0; +netdev_put_unlock: + netdev_unlock(ifq->netdev); + return ret; +} + int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { - struct pp_memory_provider_params mp_param = {}; struct io_uring_zcrx_area_reg area; struct io_uring_zcrx_ifq_reg reg; struct io_uring_region_desc rd; @@ -821,33 +861,9 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; - ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx); - if (!ifq->netdev) { - ret = -ENODEV; + ret = zcrx_register_netdev(ifq, ®, &area); + if (ret) goto err; - } - netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); - - ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq); - if (!ifq->dev) { - ret = -EOPNOTSUPP; - goto netdev_put_unlock; - } - get_device(ifq->dev); - - ret = io_zcrx_create_area(ifq, &area, ®); - if (ret) - goto netdev_put_unlock; - - if (reg.rx_buf_len) - mp_param.rx_page_size = 1U << ifq->niov_shift; - mp_param.mp_ops = &io_uring_pp_zc_ops; - mp_param.mp_priv = ifq; - ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); - if (ret) - goto netdev_put_unlock; - netdev_unlock(ifq->netdev); - ifq->if_rxq = reg.if_rxq; reg.zcrx_id = id; @@ -867,8 +883,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; } return 0; -netdev_put_unlock: - netdev_unlock(ifq->netdev); err: scoped_guard(mutex, &ctx->mmap_lock) xa_erase(&ctx->zcrx_ctxs, id); From 825f2764919fca61a88ab2f93dfdfd1d22566264 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:54 +0000 Subject: [PATCH 33/58] io_uring/zcrx: implement device-less mode for zcrx Allow creating a zcrx instance without attaching it to a net device. All data will be copied through the fallback path. The user is also expected to use ZCRX_CTRL_FLUSH_RQ to handle overflows as it normally should even with a netdev, but it becomes even more relevant as there will likely be no one to automatically pick up buffers. 
Apart from that, it follows the zcrx uapi for the I/O path, and is useful for testing, experimentation, and potentially for the copy receive path in the future if improved. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/674f8ad679c5a0bc79d538352b3042cf0999596e.1774261953.git.asml.silence@gmail.com [axboe: fix spelling error in uapi header and commit message] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/zcrx.h | 9 ++++++- io_uring/zcrx.c | 41 ++++++++++++++++++++---------- io_uring/zcrx.h | 2 +- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h index 3163a4b8aeb0..5ce02c7a6096 100644 --- a/include/uapi/linux/io_uring/zcrx.h +++ b/include/uapi/linux/io_uring/zcrx.h @@ -49,7 +49,14 @@ struct io_uring_zcrx_area_reg { }; enum zcrx_reg_flags { - ZCRX_REG_IMPORT = 1, + ZCRX_REG_IMPORT = 1, + + /* + * Register a zcrx instance without a net device. All data will be + * copied. The refill queue entries might not be automatically + * consumed and need to be flushed, see ZCRX_CTRL_FLUSH_RQ. + */ + ZCRX_REG_NODEV = 2, }; enum zcrx_features { diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b0f889b11b73..c753f88b6575 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -127,10 +127,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq, int dmabuf_fd = area_reg->dmabuf_fd; int i, ret; + if (!ifq->dev) + return -EINVAL; if (off) return -EINVAL; - if (WARN_ON_ONCE(!ifq->dev)) - return -EFAULT; if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) return -EINVAL; @@ -211,11 +211,13 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, if (ret) goto out_err; - ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table, - DMA_FROM_DEVICE, IO_DMA_ATTR); - if (ret < 0) - goto out_err; - mapped = true; + if (ifq->dev) { + ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + if (ret < 0) + goto out_err; + mapped = true; + } mem->account_pages = io_count_account_pages(pages, nr_pages); ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); @@ -450,7 +452,8 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, ret = io_import_area(ifq, &area->mem, area_reg); if (ret) goto err; - area->is_mapped = true; + if (ifq->dev) + area->is_mapped = true; if (buf_size_shift > io_area_max_shift(&area->mem)) { ret = -ERANGE; @@ -486,9 +489,11 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, niov->type = NET_IOV_IOURING; } - ret = io_populate_area_dma(ifq, area); - if (ret) - goto err; + if (ifq->dev) { + ret = io_populate_area_dma(ifq, area); + if (ret) + goto err; + } area->free_count = nr_iovs; /* we're only supporting one area per ifq for now */ @@ -826,6 +831,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EFAULT; if (reg.if_rxq == -1 || !reg.rq_entries) return -EINVAL; + if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV)) + return -EINVAL; if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { if (!(ctx->flags & IORING_SETUP_CLAMP)) return -EINVAL; @@ -861,9 +868,15 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; - ret = zcrx_register_netdev(ifq, ®, &area); - if (ret) - goto err; + if (!(reg.flags & ZCRX_REG_NODEV)) { + ret = zcrx_register_netdev(ifq, ®, &area); + if (ret) + goto err; + } else { + ret = io_zcrx_create_area(ifq, &area, ®); + if (ret) + goto err; + } reg.zcrx_id = id; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 0316a41a3561..f395656c3160 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -8,7 
+8,7 @@ #include #include -#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT) +#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV) #define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE) struct io_zcrx_mem { From ebae09bce495a0bfbf177f1972411c9a99dfcf07 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:55 +0000 Subject: [PATCH 34/58] io_uring/zcrx: use better name for RQ region Rename "region" to "rq_region" to highlight that it's a refill queue region. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/ac815790d2477a15826aecaa3d94f2a94ef507e6.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 8 ++++---- io_uring/zcrx.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index c753f88b6575..e58087073cd5 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -384,11 +384,11 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, mmap_offset = IORING_MAP_OFF_ZCRX_REGION; mmap_offset += id << IORING_OFF_PBUF_SHIFT; - ret = io_create_region(ctx, &ifq->region, rd, mmap_offset); + ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset); if (ret < 0) return ret; - ptr = io_region_get_ptr(&ifq->region); + ptr = io_region_get_ptr(&ifq->rq_region); ifq->rq_ring = (struct io_uring *)ptr; ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); @@ -397,7 +397,7 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - io_free_region(ifq->user, &ifq->region); + io_free_region(ifq->user, &ifq->rq_region); ifq->rq_ring = NULL; ifq->rqes = NULL; } @@ -645,7 +645,7 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, lockdep_assert_held(&ctx->mmap_lock); - return ifq ? &ifq->region : NULL; + return ifq ? &ifq->rq_region : NULL; } static int zcrx_box_release(struct inode *inode, struct file *file) diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index f395656c3160..3b2681a1fafd 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -66,7 +66,7 @@ struct io_zcrx_ifq { * net stack. */ struct mutex pp_lock; - struct io_mapped_region region; + struct io_mapped_region rq_region; }; #if defined(CONFIG_IO_URING_ZCRX) From 6a55a0a7ebcc8496c81827a2e9287de80f86dd57 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:56 +0000 Subject: [PATCH 35/58] io_uring/zcrx: add a struct for refill queue Add a new structure that keeps the refill queue state. It's cleaner and will be useful once we introduce multiple refill queues. 
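The new zcrx_rq fields follow the refill-ring protocol visible in the hunks below: userspace publishes entries by advancing the tail, and the kernel consumes them through a cached head under rq->lock before releasing the new head value. A rough userspace sketch of that consumer side, using C11 atomics in place of smp_load_acquire()/smp_store_release(); the ring layout and entry type here are illustrative only, not the real io_uring_zcrx_rqe format.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RING_ENTRIES 8                          /* must be a power of two */

struct ring {
        _Atomic uint32_t head;                  /* written by the consumer */
        _Atomic uint32_t tail;                  /* written by the producer */
        uint64_t rqes[RING_ENTRIES];
};

/* Consume everything the producer has published, then publish the head. */
static unsigned consume(struct ring *r, uint32_t *cached_head)
{
        uint32_t tail = atomic_load_explicit(&r->tail, memory_order_acquire);
        uint32_t mask = RING_ENTRIES - 1;
        unsigned nr = 0;

        while (*cached_head != tail) {
                uint64_t rqe = r->rqes[(*cached_head)++ & mask];

                printf("consumed entry %llu\n", (unsigned long long)rqe);
                nr++;
        }
        atomic_store_explicit(&r->head, *cached_head, memory_order_release);
        return nr;
}

int main(void)
{
        struct ring r = { .tail = 3, .rqes = { 1, 2, 3 } };
        uint32_t cached_head = 0;

        return consume(&r, &cached_head) == 3 ? 0 : 1;
}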
Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/4ce200da1ff0309c377293b949200f95f80be9ae.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 54 +++++++++++++++++++++++++------------------------ io_uring/zcrx.h | 14 ++++++++----- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index e58087073cd5..2112b652a699 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -389,8 +389,8 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, return ret; ptr = io_region_get_ptr(&ifq->rq_region); - ifq->rq_ring = (struct io_uring *)ptr; - ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + ifq->rq.ring = (struct io_uring *)ptr; + ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off); return 0; } @@ -398,8 +398,8 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { io_free_region(ifq->user, &ifq->rq_region); - ifq->rq_ring = NULL; - ifq->rqes = NULL; + ifq->rq.ring = NULL; + ifq->rq.rqes = NULL; } static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, @@ -519,7 +519,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) return NULL; ifq->if_rxq = -1; - spin_lock_init(&ifq->rq_lock); + spin_lock_init(&ifq->rq.lock); mutex_init(&ifq->pp_lock); refcount_set(&ifq->refs, 1); refcount_set(&ifq->user_refs, 1); @@ -855,7 +855,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, mmgrab(ctx->mm_account); ifq->mm_account = ctx->mm_account; } - ifq->rq_entries = reg.rq_entries; + ifq->rq.nr_entries = reg.rq_entries; scoped_guard(mutex, &ctx->mmap_lock) { /* preallocate id */ @@ -971,20 +971,19 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) xa_destroy(&ctx->zcrx_ctxs); } -static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) +static inline u32 zcrx_rq_entries(struct zcrx_rq *rq) { u32 entries; - entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head; - return min(entries, ifq->rq_entries); + entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head; + return min(entries, rq->nr_entries); } -static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq, - unsigned mask) +static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask) { - unsigned int idx = ifq->cached_rq_head++ & mask; + unsigned int idx = rq->cached_head++ & mask; - return &ifq->rqes[idx]; + return &rq->rqes[idx]; } static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, @@ -1013,18 +1012,19 @@ static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, static void io_zcrx_ring_refill(struct page_pool *pp, struct io_zcrx_ifq *ifq) { - unsigned int mask = ifq->rq_entries - 1; + struct zcrx_rq *rq = &ifq->rq; + unsigned int mask = rq->nr_entries - 1; unsigned int entries; - guard(spinlock_bh)(&ifq->rq_lock); + guard(spinlock_bh)(&rq->lock); - entries = io_zcrx_rqring_entries(ifq); + entries = zcrx_rq_entries(rq); entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL); if (unlikely(!entries)) return; do { - struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); + struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); struct net_iov *niov; netmem_ref netmem; @@ -1046,7 +1046,7 @@ static void io_zcrx_ring_refill(struct page_pool *pp, net_mp_netmem_place_in_cache(pp, netmem); } while (--entries); - smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); + smp_store_release(&rq->ring->head, rq->cached_head); } static void io_zcrx_refill_slow(struct page_pool *pp, struct 
io_zcrx_ifq *ifq) @@ -1159,14 +1159,14 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { }; static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, - struct io_zcrx_ifq *zcrx) + struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq) { - unsigned int mask = zcrx->rq_entries - 1; + unsigned int mask = rq->nr_entries - 1; unsigned int i; - nr = min(nr, io_zcrx_rqring_entries(zcrx)); + nr = min(nr, zcrx_rq_entries(rq)); for (i = 0; i < nr; i++) { - struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask); + struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); struct net_iov *niov; if (!io_parse_rqe(rqe, zcrx, &niov)) @@ -1174,7 +1174,7 @@ static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, netmem_array[i] = net_iov_to_netmem(niov); } - smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head); + smp_store_release(&rq->ring->head, rq->cached_head); return i; } @@ -1208,8 +1208,10 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, return -EINVAL; do { - scoped_guard(spinlock_bh, &zcrx->rq_lock) { - nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx); + struct zcrx_rq *rq = &zcrx->rq; + + scoped_guard(spinlock_bh, &rq->lock) { + nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq); zcrx_return_buffers(netmems, nr); } @@ -1218,7 +1220,7 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, if (fatal_signal_pending(current)) break; cond_resched(); - } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries); + } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries); return 0; } diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 3b2681a1fafd..893cd3708a06 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -41,17 +41,21 @@ struct io_zcrx_area { struct io_zcrx_mem mem; }; +struct zcrx_rq { + spinlock_t lock; + struct io_uring *ring; + struct io_uring_zcrx_rqe *rqes; + u32 cached_head; + u32 nr_entries; +}; + struct io_zcrx_ifq { struct io_zcrx_area *area; unsigned niov_shift; struct user_struct *user; struct mm_struct *mm_account; - spinlock_t rq_lock ____cacheline_aligned_in_smp; - struct io_uring *rq_ring; - struct io_uring_zcrx_rqe *rqes; - u32 cached_rq_head; - u32 rq_entries; + struct zcrx_rq rq ____cacheline_aligned_in_smp; u32 if_rxq; struct device *dev; From 898ad80d1207cbdb22b21bafb6de4adfd7627bd0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:57 +0000 Subject: [PATCH 36/58] io_uring/zcrx: use guards for locking Convert last several places using manual locking to guards to simplify the code. 
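For readers unfamiliar with the guard helpers: guard() and scoped_guard() come from the kernel's cleanup.h and are built on the compiler's cleanup attribute, so the lock is dropped automatically when the guard variable leaves scope, including on early returns. A small userspace sketch of the same mechanism with a pthread mutex; the GUARD macro below is a stand-in for illustration, not the kernel API.

#include <pthread.h>
#include <stdio.h>

static void unlock_guard(pthread_mutex_t **m)
{
        pthread_mutex_unlock(*m);
}

/* Lock on entry, let the cleanup attribute unlock when the scope is left. */
#define GUARD(m) \
        __attribute__((cleanup(unlock_guard), unused)) pthread_mutex_t *_guard = \
                (pthread_mutex_lock(m), (m))

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int free_count = 1;

static int take_one(void)
{
        GUARD(&lock);                   /* unlocked on every path out */

        if (!free_count)
                return -1;              /* early return still unlocks */
        return --free_count;
}

int main(void)
{
        printf("%d %d\n", take_one(), take_one());
        return 0;
}

That scope-exit property is what lets the patch drop the explicit unlock calls in io_zcrx_return_niov_freelist() and the other converted sites.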
Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/eb4667cfaf88c559700f6399da9e434889f5b04a.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 2112b652a699..6457690e1af4 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -586,9 +586,8 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - spin_lock_bh(&area->freelist_lock); + guard(spinlock_bh)(&area->freelist_lock); area->freelist[area->free_count++] = net_iov_idx(niov); - spin_unlock_bh(&area->freelist_lock); } static void io_zcrx_return_niov(struct net_iov *niov) @@ -1053,7 +1052,8 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) { struct io_zcrx_area *area = ifq->area; - spin_lock_bh(&area->freelist_lock); + guard(spinlock_bh)(&area->freelist_lock); + while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) { struct net_iov *niov = __io_zcrx_get_free_niov(area); netmem_ref netmem = net_iov_to_netmem(niov); @@ -1062,7 +1062,6 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) io_zcrx_sync_for_device(pp, niov); net_mp_netmem_place_in_cache(pp, netmem); } - spin_unlock_bh(&area->freelist_lock); } static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) @@ -1285,10 +1284,10 @@ static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) if (area->mem.is_dmabuf) return NULL; - spin_lock_bh(&area->freelist_lock); - if (area->free_count) - niov = __io_zcrx_get_free_niov(area); - spin_unlock_bh(&area->freelist_lock); + scoped_guard(spinlock_bh, &area->freelist_lock) { + if (area->free_count) + niov = __io_zcrx_get_free_niov(area); + } if (niov) page_pool_fragment_netmem(net_iov_to_netmem(niov), 1); From 7df542a6657534694779948195cc4d36ace575b5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:58 +0000 Subject: [PATCH 37/58] io_uring/zcrx: move count check into zcrx_get_free_niov Instead of relying on the caller of __io_zcrx_get_free_niov() to check that there are free niovs available (i.e. free_count > 0), move the check into the function and return NULL if can't allocate. It consolidates the free count checks, and it'll be easier to extend the niov free list allocator in the future. 
Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/6df04a6b3a6170f86d4345da9864f238311163f9.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 6457690e1af4..a7790d609f40 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -590,6 +590,19 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov) area->freelist[area->free_count++] = net_iov_idx(niov); } +static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area) +{ + unsigned niov_idx; + + lockdep_assert_held(&area->freelist_lock); + + if (unlikely(!area->free_count)) + return NULL; + + niov_idx = area->freelist[--area->free_count]; + return &area->nia.niovs[niov_idx]; +} + static void io_zcrx_return_niov(struct net_iov *niov) { netmem_ref netmem = net_iov_to_netmem(niov); @@ -903,16 +916,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return ret; } -static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) -{ - unsigned niov_idx; - - lockdep_assert_held(&area->freelist_lock); - - niov_idx = area->freelist[--area->free_count]; - return &area->nia.niovs[niov_idx]; -} - static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id) { return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_0); @@ -1054,12 +1057,15 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) guard(spinlock_bh)(&area->freelist_lock); - while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) { - struct net_iov *niov = __io_zcrx_get_free_niov(area); - netmem_ref netmem = net_iov_to_netmem(niov); + while (pp->alloc.count < PP_ALLOC_CACHE_REFILL) { + struct net_iov *niov = zcrx_get_free_niov(area); + netmem_ref netmem; + if (!niov) + break; net_mp_niov_set_page_pool(pp, niov); io_zcrx_sync_for_device(pp, niov); + netmem = net_iov_to_netmem(niov); net_mp_netmem_place_in_cache(pp, netmem); } } @@ -1284,10 +1290,8 @@ static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) if (area->mem.is_dmabuf) return NULL; - scoped_guard(spinlock_bh, &area->freelist_lock) { - if (area->free_count) - niov = __io_zcrx_get_free_niov(area); - } + scoped_guard(spinlock_bh, &area->freelist_lock) + niov = zcrx_get_free_niov(area); if (niov) page_pool_fragment_netmem(net_iov_to_netmem(niov), 1); From 48f253d65d39a45d2eed395bf6b8ac3bb8b1e992 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:43:59 +0000 Subject: [PATCH 38/58] io_uring/zcrx: warn on alloc with non-empty pp cache Page pool ensures the cache is empty before asking to refill it. Warn if the assumption is violated. 
Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/9c9792d6e65f3780d57ff83b6334d341ed9a5f29.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index a7790d609f40..eca30f8461cc 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -1075,8 +1075,8 @@ static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); /* pp should already be ensuring that */ - if (unlikely(pp->alloc.count)) - goto out_return; + if (WARN_ON_ONCE(pp->alloc.count)) + return 0; io_zcrx_ring_refill(pp, ifq); if (likely(pp->alloc.count)) From c0989138c0515fbffbff2d9b9093853a874440cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:44:00 +0000 Subject: [PATCH 39/58] io_uring/zcrx: netmem array as refiling format Instead of peeking into page pool allocation cache directly or via net_mp_netmem_place_in_cache(), pass a netmem array around. It's a better intermediate format, e.g. you can have it on stack and reuse the refilling code and decouples it from page pools a bit more. It still points into the page pool directly, there will be no additional copies. As the next step, we can change the callback prototype to take the netmem array from page pool. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/9d8549adb7ef6672daf2d8a52858ce5926279a82.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index eca30f8461cc..d52f96508fbe 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -1011,19 +1011,21 @@ static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, return true; } -static void io_zcrx_ring_refill(struct page_pool *pp, - struct io_zcrx_ifq *ifq) +static unsigned io_zcrx_ring_refill(struct page_pool *pp, + struct io_zcrx_ifq *ifq, + netmem_ref *netmems, unsigned to_alloc) { struct zcrx_rq *rq = &ifq->rq; unsigned int mask = rq->nr_entries - 1; unsigned int entries; + unsigned allocated = 0; guard(spinlock_bh)(&rq->lock); entries = zcrx_rq_entries(rq); - entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL); + entries = min_t(unsigned, entries, to_alloc); if (unlikely(!entries)) - return; + return 0; do { struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); @@ -1045,48 +1047,56 @@ static void io_zcrx_ring_refill(struct page_pool *pp, } io_zcrx_sync_for_device(pp, niov); - net_mp_netmem_place_in_cache(pp, netmem); + netmems[allocated] = netmem; + allocated++; } while (--entries); smp_store_release(&rq->ring->head, rq->cached_head); + return allocated; } -static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) +static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq, + netmem_ref *netmems, unsigned to_alloc) { struct io_zcrx_area *area = ifq->area; + unsigned allocated = 0; guard(spinlock_bh)(&area->freelist_lock); - while (pp->alloc.count < PP_ALLOC_CACHE_REFILL) { + for (allocated = 0; allocated < to_alloc; allocated++) { struct net_iov *niov = zcrx_get_free_niov(area); - netmem_ref netmem; if (!niov) break; net_mp_niov_set_page_pool(pp, niov); io_zcrx_sync_for_device(pp, niov); - netmem = net_iov_to_netmem(niov); - net_mp_netmem_place_in_cache(pp, netmem); + netmems[allocated] = net_iov_to_netmem(niov); } + return allocated; } static netmem_ref 
io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) { struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + netmem_ref *netmems = pp->alloc.cache; + unsigned to_alloc = PP_ALLOC_CACHE_REFILL; + unsigned allocated; /* pp should already be ensuring that */ if (WARN_ON_ONCE(pp->alloc.count)) return 0; - io_zcrx_ring_refill(pp, ifq); - if (likely(pp->alloc.count)) + allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc); + if (likely(allocated)) goto out_return; - io_zcrx_refill_slow(pp, ifq); - if (!pp->alloc.count) + allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc); + if (!allocated) return 0; out_return: - return pp->alloc.cache[--pp->alloc.count]; + allocated--; + pp->alloc.count += allocated; + return netmems[allocated]; } static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) From 61cfadaae6612830b1d4c3457a9935d362af8839 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:44:01 +0000 Subject: [PATCH 40/58] io_uring/zcrx: consolidate dma syncing Split refilling into two steps, first allocate niovs, and then do DMA sync for them. This way dma synchronisation code can be better optimised. E.g. we don't need to call dma_dev_need_sync() for each every niov, and maybe we can coalesce sync for adjacent netmems in the future as well. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/19f2d50baa62ff2e0c6cd56dd7c394cab728c567.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index d52f96508fbe..77be6fb32df6 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -300,21 +300,23 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, } } -static void io_zcrx_sync_for_device(struct page_pool *pool, - struct net_iov *niov) +static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx, + netmem_ref *netmems, unsigned nr) { #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + struct device *dev = pp->p.dev; + unsigned i, niov_size; dma_addr_t dma_addr; - unsigned niov_size; - - if (!dma_dev_need_sync(pool->p.dev)) + if (!dma_dev_need_sync(dev)) return; + niov_size = 1U << zcrx->niov_shift; - niov_size = 1U << io_pp_to_ifq(pool)->niov_shift; - dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); - __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, - niov_size, pool->p.dma_dir); + for (i = 0; i < nr; i++) { + dma_addr = page_pool_get_dma_addr_netmem(netmems[i]); + __dma_sync_single_for_device(dev, dma_addr + pp->p.offset, + niov_size, pp->p.dma_dir); + } #endif } @@ -1046,7 +1048,6 @@ static unsigned io_zcrx_ring_refill(struct page_pool *pp, continue; } - io_zcrx_sync_for_device(pp, niov); netmems[allocated] = netmem; allocated++; } while (--entries); @@ -1069,7 +1070,6 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if if (!niov) break; net_mp_niov_set_page_pool(pp, niov); - io_zcrx_sync_for_device(pp, niov); netmems[allocated] = net_iov_to_netmem(niov); } return allocated; @@ -1094,6 +1094,7 @@ static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) if (!allocated) return 0; out_return: + zcrx_sync_for_device(pp, ifq, netmems, allocated); allocated--; pp->alloc.count += allocated; return netmems[allocated]; From f0b92207a00c731cfbfdefdcf9f9350a11e30ab3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:44:02 +0000 Subject: [PATCH 41/58] io_uring/zcrx: 
warn on a repeated area append We only support a single area, no path should be able to call io_zcrx_append_area() twice. Warn if that happens instead of just returning an error. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/28eb67fb8c48445584d7c247a36e1ad8800f0c8b.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 77be6fb32df6..e637052b645a 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -423,7 +423,7 @@ static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { - if (ifq->area) + if (WARN_ON_ONCE(ifq->area)) return -EINVAL; ifq->area = area; return 0; From 5c727ce042988df45232cfdb6599bb46116fd69c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:44:03 +0000 Subject: [PATCH 42/58] io_uring/zcrx: cache fallback availability in zcrx ctx Store a flag in struct io_zcrx_ifq telling if the backing memory is normal page or dmabuf based. It was looking it up from the area, however it logically allocates from the zcrx ctx and not a particular area, and once we add more than one area it'll become a mess. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/65e75408a7758fe7e60fae89b7a8d5ae4857f515.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 9 ++++++++- io_uring/zcrx.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index e637052b645a..32829775fb6b 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -423,8 +423,13 @@ static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { + bool kern_readable = !area->mem.is_dmabuf; + if (WARN_ON_ONCE(ifq->area)) return -EINVAL; + if (WARN_ON_ONCE(ifq->kern_readable != kern_readable)) + return -EINVAL; + ifq->area = area; return 0; } @@ -882,6 +887,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; + ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF); + if (!(reg.flags & ZCRX_REG_NODEV)) { ret = zcrx_register_netdev(ifq, ®, &area); if (ret) @@ -1298,7 +1305,7 @@ static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) struct io_zcrx_area *area = ifq->area; struct net_iov *niov = NULL; - if (area->mem.is_dmabuf) + if (!ifq->kern_readable) return NULL; scoped_guard(spinlock_bh, &area->freelist_lock) diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 893cd3708a06..3e07238a4eb0 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -54,6 +54,7 @@ struct io_zcrx_ifq { unsigned niov_shift; struct user_struct *user; struct mm_struct *mm_account; + bool kern_readable; struct zcrx_rq rq ____cacheline_aligned_in_smp; From de6ed1b323fc50eaa3d7847274cff51055b5c498 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:44:04 +0000 Subject: [PATCH 43/58] io_uring/zcrx: check ctrl op payload struct sizes Add a build check that ctrl payloads are of the same size and don't grow struct zcrx_ctrl. 
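BUILD_BUG_ON() turns the size relation into a compile-time failure rather than something to catch in review. A standalone sketch of the same technique using C11 _Static_assert; the payload structs below are invented for illustration and are not the real zcrx_ctrl uapi layout.

#include <stdint.h>

struct payload_a { uint64_t q0, q1; };                     /* 16 bytes */
struct payload_b { uint64_t addr; uint32_t len, flags; };  /* 16 bytes */

struct ctrl {
        uint32_t op;
        uint32_t resv;
        union {
                struct payload_a a;
                struct payload_b b;
        };
};

/* Fails the build if one payload outgrows the other or the struct grows. */
_Static_assert(sizeof(struct payload_a) == sizeof(struct payload_b),
               "ctrl payloads must stay the same size");
_Static_assert(sizeof(struct ctrl) == 24, "struct ctrl must not grow");

int main(void) { return 0; }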
Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/af66caf9776d18e9ff880ab828eb159a6a03caf5.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 32829775fb6b..e2e0df78dae1 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -1253,6 +1253,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) struct zcrx_ctrl ctrl; struct io_zcrx_ifq *zcrx; + BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush)); + if (nr_args) return -EINVAL; if (copy_from_user(&ctrl, arg, sizeof(ctrl))) From 7c713dd0078651d040a0251eab6e29e3c2e4ee11 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Mar 2026 12:44:05 +0000 Subject: [PATCH 44/58] io_uring/zcrx: rename zcrx [un]register functions Drop "ifqs" from function names, as it refers to an interface queue and there might be none once a device-less mode is introduced. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/657874acd117ec30fa6f45d9d844471c753b5a0f.1774261953.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/register.c | 2 +- io_uring/zcrx.c | 6 +++--- io_uring/zcrx.h | 10 +++++----- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 34104c256c88..16122f877aed 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2156,7 +2156,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_sqe_buffers_unregister(ctx); io_sqe_files_unregister(ctx); - io_unregister_zcrx_ifqs(ctx); + io_unregister_zcrx(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_free_alloc_caches(ctx); diff --git a/io_uring/register.c b/io_uring/register.c index 489a6feaf228..35432471a550 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -900,7 +900,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = -EINVAL; if (!arg || nr_args != 1) break; - ret = io_register_zcrx_ifq(ctx, arg); + ret = io_register_zcrx(ctx, arg); break; case IORING_REGISTER_RESIZE_RINGS: ret = -EINVAL; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index e2e0df78dae1..f94f74d0f566 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -816,8 +816,8 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq, return ret; } -int io_register_zcrx_ifq(struct io_ring_ctx *ctx, - struct io_uring_zcrx_ifq_reg __user *arg) +int io_register_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) { struct io_uring_zcrx_area_reg area; struct io_uring_zcrx_ifq_reg reg; @@ -955,7 +955,7 @@ void io_terminate_zcrx(struct io_ring_ctx *ctx) } } -void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +void io_unregister_zcrx(struct io_ring_ctx *ctx) { struct io_zcrx_ifq *ifq; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 3e07238a4eb0..75e0a4e6ef6e 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -76,9 +76,9 @@ struct io_zcrx_ifq { #if defined(CONFIG_IO_URING_ZCRX) int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg); -int io_register_zcrx_ifq(struct io_ring_ctx *ctx, +int io_register_zcrx(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); -void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); +void io_unregister_zcrx(struct io_ring_ctx *ctx); void io_terminate_zcrx(struct io_ring_ctx *ctx); int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct socket *sock, 
unsigned int flags, @@ -86,12 +86,12 @@ int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, unsigned int id); #else -static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, - struct io_uring_zcrx_ifq_reg __user *arg) +static inline int io_register_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) { return -EOPNOTSUPP; } -static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +static inline void io_unregister_zcrx(struct io_ring_ctx *ctx) { } static inline void io_terminate_zcrx(struct io_ring_ctx *ctx) From 19a8cc6cda580a3726ab8f117e7c6de507376d9b Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Tue, 31 Mar 2026 18:45:09 +0800 Subject: [PATCH 45/58] io_uring/rsrc: use io_cache_free() to free node Replace kfree(node) with io_cache_free() in io_buffer_register_bvec() to match all other error paths that free nodes allocated via io_rsrc_node_alloc(). The node is allocated through io_cache_alloc() internally, so it should be returned to the cache via io_cache_free() for proper object reuse. Signed-off-by: Jackie Liu Link: https://patch.msgid.link/20260331104509.7055-1-liu.yun@linux.dev [axboe: remove fixes tag, it's not a fix, it's a cleanup] Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 52554ed89b11..2d8be5edbbf6 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -961,7 +961,7 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, */ imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); if (!imu) { - kfree(node); + io_cache_free(&ctx->node_cache, node); ret = -ENOMEM; goto unlock; } From 85a58309c0d5b5f5a4b65658312ceaf2c34c9bbf Mon Sep 17 00:00:00 2001 From: Amir Mohammad Jahangirzad Date: Wed, 1 Apr 2026 02:51:13 +0330 Subject: [PATCH 46/58] io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP io_async_cancel_prep() reads the opcode selector from sqe->len and stores it in cancel->opcode, which is an 8-bit field. Since sqe->len is a 32-bit value, values larger than U8_MAX are implicitly truncated. This can cause unintended opcode matches when the truncated value corresponds to a valid io_uring opcode. For example, submitting a value such as 0x10b will be truncated to 0x0b (IORING_OP_TIMEOUT), allowing a cancel request to match operations it did not intend to target. Validate the opcode value before assigning it to the 8-bit field and reject values outside the valid io_uring opcode range. 
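The truncation is easy to demonstrate in isolation: assigning the 32-bit value read from sqe->len to an 8-bit field silently drops the upper bits, so an out-of-range selector aliases to a valid opcode. A minimal standalone reproduction of the effect (plain C, no io_uring involved):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t sqe_len = 0x10b;       /* user-supplied selector */
        uint8_t opcode = sqe_len;       /* implicit truncation to 8 bits */

        /* prints 0x10b and 0xb: 0x10b would wrongly match IORING_OP_TIMEOUT */
        printf("submitted 0x%x, matched as opcode 0x%x\n", sqe_len, opcode);
        return 0;
}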
Signed-off-by: Amir Mohammad Jahangirzad Link: https://patch.msgid.link/20260331232113.615972-1-a.jahangirzad@gmail.com Signed-off-by: Jens Axboe --- io_uring/cancel.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 65e04063e343..5e5eb9cfc7cd 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -156,9 +156,16 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) cancel->fd = READ_ONCE(sqe->fd); } if (cancel->flags & IORING_ASYNC_CANCEL_OP) { + u32 op; + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; - cancel->opcode = READ_ONCE(sqe->len); + + op = READ_ONCE(sqe->len); + if (op >= IORING_OP_LAST) + return -EINVAL; + + cancel->opcode = op; } return 0; From a9d008489f0c5304ca7f705348324e47824a7454 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 31 Mar 2026 22:07:38 +0100 Subject: [PATCH 47/58] io_uring/zcrx: reject REG_NODEV with large rx_buf_size The copy fallback path doesn't care about the actual niov size and only uses first PAGE_SIZE bytes, and any additional space will be wasted. Since ZCRX_REG_NODEV solely relies on the copy path, it doesn't make sense to support non-standard rx_buf_len. Reject it for now, and re-enable once improved. Fixes: c11728021d5cd ("io_uring/zcrx: implement device-less mode for zcrx") Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/3e7652d9c27f8ac5d2b141e3af47971f2771fb05.1774780198.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index f94f74d0f566..1ce867c68446 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -449,6 +449,8 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, return -EINVAL; buf_size_shift = ilog2(reg->rx_buf_len); } + if (!ifq->dev && buf_size_shift != PAGE_SHIFT) + return -EOPNOTSUPP; ret = -ENOMEM; area = kzalloc_obj(*area); @@ -462,7 +464,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, if (ifq->dev) area->is_mapped = true; - if (buf_size_shift > io_area_max_shift(&area->mem)) { + if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) { ret = -ERANGE; goto err; } From 77d8c8d0f1b76a005267ee9714ed98964c87ecc5 Mon Sep 17 00:00:00 2001 From: Anas Iqbal Date: Tue, 31 Mar 2026 22:07:39 +0100 Subject: [PATCH 48/58] io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring() Smatch warns: io_uring/zcrx.c:393 io_allocate_rbuf_ring() warn: should 'id << 16' be a 64 bit type? The expression 'id << IORING_OFF_PBUF_SHIFT' is evaluated using 32-bit arithmetic because id is a u32. This may overflow before being promoted to the 64-bit mmap_offset. Cast id to u64 before shifting to ensure the shift is performed in 64-bit arithmetic. 
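The promotion issue can be shown in a few lines: with a 32-bit id, the shift is performed in 32-bit arithmetic and wraps before the result is widened, while casting first keeps every bit. The id value below is exaggerated purely to make the wrap visible.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t id = 0x10000;                  /* exaggerated for effect */
        uint64_t wrapped = id << 16;            /* 32-bit shift: wraps to 0 */
        uint64_t widened = (uint64_t)id << 16;  /* 64-bit shift: 0x100000000 */

        printf("32-bit shift: 0x%llx, 64-bit shift: 0x%llx\n",
               (unsigned long long)wrapped,
               (unsigned long long)widened);
        return 0;
}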
Signed-off-by: Anas Iqbal Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/52400e1b343691416bef3ed3ae287fb1a88d407f.1774780198.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 1ce867c68446..b8f15439d5df 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -384,7 +384,7 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, return -EINVAL; mmap_offset = IORING_MAP_OFF_ZCRX_REGION; - mmap_offset += id << IORING_OFF_PBUF_SHIFT; + mmap_offset += (u64)id << IORING_OFF_PBUF_SHIFT; ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset); if (ret < 0) From 8ae2837d5a97644b729a889951127da98111a32d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 31 Mar 2026 22:07:40 +0100 Subject: [PATCH 49/58] io_uring/zcrx: don't use mark0 for allocating xarray XA_MARK_0 is not compatible with xarray allocating entries, use XA_MARK_1. Fixes: fda90d43f4fac ("io_uring/zcrx: return back two step unregistration") Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/f232cfd3c466047d333b474dd2bddd246b6ebb82.1774780198.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b8f15439d5df..5c0a49340722 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -929,12 +929,12 @@ int io_register_zcrx(struct io_ring_ctx *ctx, static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id) { - return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_0); + return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); } static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id) { - xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_0); + xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); } void io_terminate_zcrx(struct io_ring_ctx *ctx) From 52dcd1776bed614c6a270d9237df6105feab4c14 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 31 Mar 2026 22:07:41 +0100 Subject: [PATCH 50/58] io_uring/zcrx: don't clear not allocated niovs Now that area->is_mapped is set earlier before niovs array is allocated, io_zcrx_free_area -> io_zcrx_unmap_area in an error path can try to clear dma addresses for unallocated niovs, fix it. Fixes: 8c0cab0b7bf7 ("io_uring/zcrx: always dma map in advance") Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/cbcb7749b5a001ecd4d1c303515ce9403215640c.1774780198.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 5c0a49340722..d84ad40eae49 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -289,8 +289,10 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, return; area->is_mapped = false; - for (i = 0; i < area->nia.num_niovs; i++) - net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); + if (area->nia.niovs) { + for (i = 0; i < area->nia.num_niovs; i++) + net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); + } if (area->mem.is_dmabuf) { io_release_dmabuf(&area->mem); From 7120b87bed922ae2f1968c081377162380e1547e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 31 Mar 2026 22:07:42 +0100 Subject: [PATCH 51/58] io_uring/zcrx: use dma_len for chunk size calculation Buffers are now dma-mapped earlier and we can sg_dma_len(), otherwise, since it's walking with for_each_sgtable_dma_sg(), it might wrongfully reject some configurations. 
As a bonus, it'd now be able to use larger chunks if dma addresses are coalesced e.g by iommu. Fixes: 8c0cab0b7bf7 ("io_uring/zcrx: always dma map in advance") Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/03b219af3f6cfdd1cf64679b8bab7461e47cc123.1774780198.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index d84ad40eae49..3bf800426fd2 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -63,7 +63,7 @@ static int io_area_max_shift(struct io_zcrx_mem *mem) unsigned i; for_each_sgtable_dma_sg(sgt, sg, i) - shift = min(shift, __ffs(sg->length)); + shift = min(shift, __ffs(sg_dma_len(sg))); return shift; } From 4c6f93951b8fc556f2a37d45b32cb7f7e76b0e91 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 31 Mar 2026 22:07:43 +0100 Subject: [PATCH 52/58] io_uring/zcrx: use correct mmap off constants zcrx was using IORING_OFF_PBUF_SHIFT during first iterations, but there is now a separate constant it should use. Both are 16 so it doesn't change anything, but improve it for the future. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/fe16ebe9ba4048a7e12f9b3b50880bd175b1ce03.1774780198.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 3bf800426fd2..bd970fb084c1 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -386,7 +386,7 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, return -EINVAL; mmap_offset = IORING_MAP_OFF_ZCRX_REGION; - mmap_offset += (u64)id << IORING_OFF_PBUF_SHIFT; + mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT; ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset); if (ret < 0) From c7f3aaf3e835f2dc0f3f293ae3739b844b909595 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Apr 2026 10:35:11 -0700 Subject: [PATCH 53/58] io_uring/rw: clean up __io_read() obsolete comment and early returns After commit a9165b83c193 ("io_uring/rw: always setup io_async_rw for read/write requests") which moved the iovec allocation into the prep path and stores it in req->async_data where it now gets freed as part of the request lifecycle, this comment is now outdated. Remove it and clean up the goto as well. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20260401173511.4052303-1-joannelkoong@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 046f76a71b9c..20654deff84d 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -962,13 +962,13 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, if (ret == -EAGAIN) { /* If we can poll, just do that. 
*/ if (io_file_can_poll(req)) - return -EAGAIN; + return ret; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->flags & REQ_F_IOPOLL)) - goto done; + return ret; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) - goto done; + return ret; ret = 0; } else if (ret == -EIOCBQUEUED) { return IOU_ISSUE_SKIP_COMPLETE; @@ -976,7 +976,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || (issue_flags & IO_URING_F_MULTISHOT)) { /* read all, failed, already did sync or don't want to retry */ - goto done; + return ret; } /* @@ -1019,8 +1019,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, kiocb->ki_flags &= ~IOCB_WAITQ; iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); -done: - /* it's faster to check here than delegate to kfree */ + return ret; } From f847bf6d29304087f94ef4b4a8646f69d96945f9 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Thu, 2 Apr 2026 09:49:52 +0800 Subject: [PATCH 54/58] io_uring/timeout: use 'ctx' consistently There's already a local ctx variable, yet cq_timeouts accounting uses req->ctx. Use ctx consistently. Signed-off-by: Yang Xiuwei Link: https://patch.msgid.link/20260402014952.260414-1-yangxiuwei@kylinos.cn Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 579fdddac71a..4cfdfc519770 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -284,8 +284,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); + atomic_set(&ctx->cq_timeouts, + atomic_read(&ctx->cq_timeouts) + 1); raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) From 2c453a4281245135b9e6f1048962272c74853b53 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Apr 2026 11:31:05 -0600 Subject: [PATCH 55/58] io_uring/tctx: have io_uring_alloc_task_context() return tctx Instead of having io_uring_alloc_task_context() return an int and assign tsk->io_uring, just have it return the task context directly. This enables cleaner error handling in callers, which may have failure points post calling io_uring_alloc_task_context(). 
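The new return style is the kernel's usual ERR_PTR convention: a single pointer return carries either a valid object or an encoded negative errno, distinguished with IS_ERR() and unpacked with PTR_ERR(). A small userspace sketch of the idiom; the macros are simplified stand-ins for the kernel's err.h, which reserves the top 4095 addresses for error codes.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO       4095
#define ERR_PTR(err)    ((void *)(long)(err))
#define PTR_ERR(ptr)    ((long)(ptr))
#define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

struct tctx { int nr_users; };

/* Either a valid tctx or an encoded -errno, never NULL plus an out-param. */
static struct tctx *alloc_tctx(int simulate_failure)
{
        if (simulate_failure)
                return ERR_PTR(-ENOMEM);
        return calloc(1, sizeof(struct tctx));
}

int main(void)
{
        struct tctx *tctx = alloc_tctx(1);

        if (IS_ERR(tctx)) {
                printf("allocation failed: %ld\n", PTR_ERR(tctx));
                return 1;
        }
        free(tctx);
        return 0;
}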
Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 8 +++++++- io_uring/tctx.c | 21 ++++++++++----------- io_uring/tctx.h | 4 ++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index c6bb938ec5ea..46c12afec73e 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -458,6 +458,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, return -EINVAL; } if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_uring_task *tctx; struct task_struct *tsk; struct io_sq_data *sqd; bool attached; @@ -524,8 +525,13 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, rcu_assign_pointer(sqd->thread, tsk); mutex_unlock(&sqd->lock); + ret = 0; get_task_struct(tsk); - ret = io_uring_alloc_task_context(tsk, ctx); + tctx = io_uring_alloc_task_context(tsk, ctx); + if (!IS_ERR(tctx)) + tsk->io_uring = tctx; + else + ret = PTR_ERR(tctx); wake_up_new_task(tsk); if (ret) goto err; diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 143de8e990eb..e5cef6a8dde0 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -74,20 +74,20 @@ void __io_uring_free(struct task_struct *tsk) } } -__cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) +__cold struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; tctx = kzalloc_obj(*tctx); if (unlikely(!tctx)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { kfree(tctx); - return ret; + return ERR_PTR(ret); } tctx->io_wq = io_init_wq_offload(ctx, task); @@ -95,7 +95,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); kfree(tctx); - return ret; + return ERR_PTR(ret); } tctx->task = task; @@ -103,10 +103,9 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_cancel, 0); atomic_set(&tctx->inflight_tracked, 0); - task->io_uring = tctx; init_llist_head(&tctx->task_list); init_task_work(&tctx->task_work, tctx_task_work); - return 0; + return tctx; } int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) @@ -116,11 +115,11 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) int ret; if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current, ctx); - if (unlikely(ret)) - return ret; + tctx = io_uring_alloc_task_context(current, ctx); + if (IS_ERR(tctx)) + return PTR_ERR(tctx); - tctx = current->io_uring; + current->io_uring = tctx; if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) { unsigned int limits[2] = { ctx->iowq_limits[0], ctx->iowq_limits[1], }; diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 608e96de70a2..2310d2a0c46d 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -6,8 +6,8 @@ struct io_tctx_node { struct io_ring_ctx *ctx; }; -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); +struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx); From 7880174e1e5e88944ea75cf871efd77ec5e3ef51 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Apr 2026 11:31:38 -0600 Subject: [PATCH 56/58] io_uring/tctx: clean up __io_uring_add_tctx_node() error handling Refactor __io_uring_add_tctx_node() so that on error 
it never leaves current->io_uring pointing at a half-setup tctx. This moves the assignment of current->io_uring to the end of the function post any failure points. Separate out the node installation into io_tctx_install_node() to further clean this up. Signed-off-by: Jens Axboe --- io_uring/tctx.c | 60 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index e5cef6a8dde0..61533f30494f 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -108,10 +108,37 @@ __cold struct io_uring_task *io_uring_alloc_task_context(struct task_struct *tas return tctx; } +static int io_tctx_install_node(struct io_ring_ctx *ctx, + struct io_uring_task *tctx) +{ + struct io_tctx_node *node; + int ret; + + if (xa_load(&tctx->xa, (unsigned long)ctx)) + return 0; + + node = kmalloc_obj(*node); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->task = current; + + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, + node, GFP_KERNEL)); + if (ret) { + kfree(node); + return ret; + } + + mutex_lock(&ctx->tctx_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->tctx_lock); + return 0; +} + int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; int ret; if (unlikely(!tctx)) { @@ -119,14 +146,13 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) if (IS_ERR(tctx)) return PTR_ERR(tctx); - current->io_uring = tctx; if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) { unsigned int limits[2] = { ctx->iowq_limits[0], ctx->iowq_limits[1], }; ret = io_wq_max_workers(tctx->io_wq, limits); if (ret) - return ret; + goto err_free; } } @@ -137,25 +163,19 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) */ if (tctx->io_wq) io_wq_set_exit_on_idle(tctx->io_wq, false); - if (!xa_load(&tctx->xa, (unsigned long)ctx)) { - node = kmalloc_obj(*node); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } - - mutex_lock(&ctx->tctx_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->tctx_lock); + ret = io_tctx_install_node(ctx, tctx); + if (!ret) { + current->io_uring = tctx; + return 0; } - return 0; + if (!current->io_uring) { +err_free: + io_wq_put_and_exit(tctx->io_wq); + percpu_counter_destroy(&tctx->inflight); + kfree(tctx); + } + return ret; } int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx) From b4d893d636f435701f025e43146d0a4b9a065102 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Apr 2026 11:50:08 -0600 Subject: [PATCH 57/58] io_uring/register: don't get a reference to the registered ring fd This isn't necessary and was only done because the register path isn't a hot path and hence the extra ref/put doesn't matter, and to have the exit path be able to unconditionally put whatever file was gotten regardless of the type. In preparation for sharing this code with the main io_uring_enter(2) syscall, drop the reference and have the caller conditionally put the file if it was a normal file descriptor. 
Signed-off-by: Jens Axboe --- io_uring/register.c | 8 ++++---- io_uring/rsrc.c | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index 35432471a550..95cfa88dc621 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -941,7 +941,8 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, /* * Given an 'fd' value, return the ctx associated with if. If 'registered' is * true, then the registered index is used. Otherwise, the normal fd table. - * Caller must call fput() on the returned file, unless it's an ERR_PTR. + * Caller must call fput() on the returned file if it isn't a registered file, + * unless it's an ERR_PTR. */ struct file *io_uring_register_get_file(unsigned int fd, bool registered) { @@ -958,8 +959,6 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered) return ERR_PTR(-EINVAL); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; - if (file) - get_file(file); } else { file = fget(fd); } @@ -1038,6 +1037,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx->buf_table.nr, ret); mutex_unlock(&ctx->uring_lock); - fput(file); + if (!use_registered_ring) + fput(file); return ret; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 2d8be5edbbf6..cb12194b35e8 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1291,7 +1291,8 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) if (src_ctx != ctx) mutex_unlock(&src_ctx->uring_lock); - fput(file); + if (!registered_src) + fput(file); return ret; } From c5e9f6a96bf7379da87df1b852b90527e242b56f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Apr 2026 11:56:02 -0600 Subject: [PATCH 58/58] io_uring: unify getting ctx from passed in file descriptor io_uring_enter() and io_uring_register() end up having duplicated code for getting a ctx from a passed in file descriptor, for either a registered ring descriptor or a normal file descriptor. Move the io_uring_register_get_file() into io_uring.c and name it a bit more generically, and use it from both callsites rather than have that logic and handling duplicated. Signed-off-by: Jens Axboe --- io_uring/bpf-ops.c | 2 +- io_uring/io_uring.c | 59 ++++++++++++++++++++++++++++----------------- io_uring/io_uring.h | 1 + io_uring/register.c | 35 +-------------------------- io_uring/register.h | 1 - io_uring/rsrc.c | 2 +- 6 files changed, 41 insertions(+), 59 deletions(-) diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c index e4b244337aa9..937e48bef40b 100644 --- a/io_uring/bpf-ops.c +++ b/io_uring/bpf-ops.c @@ -181,7 +181,7 @@ static int bpf_io_reg(void *kdata, struct bpf_link *link) struct file *file; int ret = -EBUSY; - file = io_uring_register_get_file(ops->ring_fd, false); + file = io_uring_ctx_get_file(ops->ring_fd, false); if (IS_ERR(file)) return PTR_ERR(file); ctx = file->private_data; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 16122f877aed..003f0e081d92 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2543,6 +2543,40 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, #endif } +/* + * Given an 'fd' value, return the ctx associated with if. If 'registered' is + * true, then the registered index is used. Otherwise, the normal fd table. + * Caller must call fput() on the returned file if it isn't a registered file, + * unless it's an ERR_PTR. 
+ */ +struct file *io_uring_ctx_get_file(unsigned int fd, bool registered) +{ + struct file *file; + + if (registered) { + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. + */ + struct io_uring_task *tctx = current->io_uring; + + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) + return ERR_PTR(-EINVAL); + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); + file = tctx->registered_rings[fd]; + } else { + file = fget(fd); + } + + if (unlikely(!file)) + return ERR_PTR(-EBADF); + if (io_is_uring_fops(file)) + return file; + fput(file); + return ERR_PTR(-EOPNOTSUPP); +} + + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const void __user *, argp, size_t, argsz) @@ -2554,28 +2588,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(flags & ~IORING_ENTER_FLAGS)) return -EINVAL; - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. - */ - if (flags & IORING_ENTER_REGISTERED_RING) { - struct io_uring_task *tctx = current->io_uring; - - if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return -EINVAL; - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); - file = tctx->registered_rings[fd]; - if (unlikely(!file)) - return -EBADF; - } else { - file = fget(fd); - if (unlikely(!file)) - return -EBADF; - ret = -EOPNOTSUPP; - if (unlikely(!io_is_uring_fops(file))) - goto out; - } - + file = io_uring_ctx_get_file(fd, flags & IORING_ENTER_REGISTERED_RING); + if (IS_ERR(file)) + return PTR_ERR(file); ctx = file->private_data; ret = -EBADFD; /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 91cf67b5d85b..e43995682c8b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -173,6 +173,7 @@ void io_req_track_inflight(struct io_kiocb *req); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); +struct file *io_uring_ctx_get_file(unsigned int fd, bool registered); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); diff --git a/io_uring/register.c b/io_uring/register.c index 95cfa88dc621..6260196929a7 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -938,39 +938,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return ret; } -/* - * Given an 'fd' value, return the ctx associated with if. If 'registered' is - * true, then the registered index is used. Otherwise, the normal fd table. - * Caller must call fput() on the returned file if it isn't a registered file, - * unless it's an ERR_PTR. - */ -struct file *io_uring_register_get_file(unsigned int fd, bool registered) -{ - struct file *file; - - if (registered) { - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. 
- */ - struct io_uring_task *tctx = current->io_uring; - - if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return ERR_PTR(-EINVAL); - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); - file = tctx->registered_rings[fd]; - } else { - file = fget(fd); - } - - if (unlikely(!file)) - return ERR_PTR(-EBADF); - if (io_is_uring_fops(file)) - return file; - fput(file); - return ERR_PTR(-EOPNOTSUPP); -} - static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args) { struct io_uring_sqe sqe; @@ -1025,7 +992,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, if (fd == -1) return io_uring_register_blind(opcode, arg, nr_args); - file = io_uring_register_get_file(fd, use_registered_ring); + file = io_uring_ctx_get_file(fd, use_registered_ring); if (IS_ERR(file)) return PTR_ERR(file); ctx = file->private_data; diff --git a/io_uring/register.h b/io_uring/register.h index a5f39d5ef9e0..c9da997d503c 100644 --- a/io_uring/register.h +++ b/io_uring/register.h @@ -4,6 +4,5 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); -struct file *io_uring_register_get_file(unsigned int fd, bool registered); #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index cb12194b35e8..57151c01da0f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1269,7 +1269,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; - file = io_uring_register_get_file(buf.src_fd, registered_src); + file = io_uring_ctx_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file);