From a08d195b586a217d76b42062f88f375a3eedda4d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Sep 2023 13:31:56 -0600 Subject: [PATCH 01/14] io_uring/rw: split io_read() into a helper Add __io_read() which does the grunt of the work, leaving the completion side to the new io_read(). No functional changes in this patch. Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/rw.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index c8c822fa7980..5c58962d73da 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -708,7 +708,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) return 0; } -int io_read(struct io_kiocb *req, unsigned int issue_flags) +static int __io_read(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_rw_state __s, *s = &__s; @@ -853,7 +853,18 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) /* it's faster to check here then delegate to kfree */ if (iovec) kfree(iovec); - return kiocb_done(req, ret, issue_flags); + return ret; +} + +int io_read(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = __io_read(req, issue_flags); + if (ret >= 0) + return kiocb_done(req, ret, issue_flags); + + return ret; } int io_write(struct io_kiocb *req, unsigned int issue_flags) From d2d778fbf9964e4e5b8d7420eba8ec5ce938e794 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Sep 2023 13:46:07 -0600 Subject: [PATCH 02/14] io_uring/rw: mark readv/writev as vectored in the opcode definition This is cleaner than gating on the opcode type, particularly as more read/write type opcodes may be added. Then we can use that for the data import, and for __io_read() on whether or not we need to copy state. 
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/opdef.c | 2 ++ io_uring/opdef.h | 2 ++ io_uring/rw.c | 10 ++++++---- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3b9c6489b8b6..f40904065500 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -63,6 +63,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .vectored = 1, .prep = io_prep_rw, .issue = io_read, }, @@ -76,6 +77,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .vectored = 1, .prep = io_prep_rw, .issue = io_write, }, diff --git a/io_uring/opdef.h b/io_uring/opdef.h index c22c8696e749..9e5435ec27d0 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -29,6 +29,8 @@ struct io_issue_def { unsigned iopoll_queue : 1; /* opcode specific path will handle ->async_data allocation if needed */ unsigned manual_alloc : 1; + /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ + unsigned vectored : 1; int (*issue)(struct io_kiocb *, unsigned int); int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); diff --git a/io_uring/rw.c b/io_uring/rw.c index 5c58962d73da..83ae911c2868 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -388,8 +388,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, buf = u64_to_user_ptr(rw->addr); sqe_len = rw->len; - if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE || - (req->flags & REQ_F_BUFFER_SELECT)) { + if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) { if (io_do_buffer_select(req)) { buf = io_buffer_select(req, &sqe_len, issue_flags); if (!buf) @@ -776,8 +775,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; - /* if we can poll, just do that */ - if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) + /* + * If we can poll, just do that. For a vectored read, we'll + * need to copy state first. + */ + if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) From fc68fcda049108478ee4704d8a3ad3e05cc72fd0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Sep 2023 13:35:42 -0600 Subject: [PATCH 03/14] io_uring/rw: add support for IORING_OP_READ_MULTISHOT This behaves like IORING_OP_READ, except: 1) It only supports pollable files (eg pipes, sockets, etc). Note that for sockets, you probably want to use recv/recvmsg with multishot instead. 2) It supports multishot mode, meaning it will repeatedly trigger a read and fill a buffer when data is available. This allows similar use to recv/recvmsg but on non-sockets, where a single request will repeatedly post a CQE whenever data is read from it. 3) Because of #2, it must be used with provided buffers. This is uniformly true across any request type that supports multishot and transfers data, with the reason being that it's obviously not possible to pass in a single buffer for the data, as multiple reads may very well trigger before an application has a chance to process previous CQEs and the data passed from them. 
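As a rough illustration of the interface (not part of this patch), a userspace consumer could arm a multishot read against a pipe using a provided-buffer group along the lines below. This sketch assumes liburing only for the ring plumbing; pipe_fd, BGID and the ring itself are placeholders, BGID must have been populated with provided buffers beforehand, and the raw opcode/flag assignments follow the uapi additions in this patch.

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	/* no buffer/length given; one is picked from group BGID per read */
	io_uring_prep_read(sqe, pipe_fd, NULL, 0, 0);
	sqe->opcode = IORING_OP_READ_MULTISHOT;
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = BGID;
	io_uring_submit(&ring);

	for (;;) {
		struct io_uring_cqe *cqe;

		io_uring_wait_cqe(&ring, &cqe);
		if (cqe->res < 0 || !(cqe->flags & IORING_CQE_F_MORE)) {
			/* error, EOF or CQ overflow terminated the request */
			io_uring_cqe_seen(&ring, cqe);
			break;
		}
		/*
		 * cqe->res bytes were read into the provided buffer whose id
		 * is in cqe->flags >> IORING_CQE_BUFFER_SHIFT; process it and
		 * hand the buffer back to the group when done.
		 */
		io_uring_cqe_seen(&ring, cqe);
	}
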
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/opdef.c | 13 ++++++- io_uring/rw.c | 67 +++++++++++++++++++++++++++++++++++ io_uring/rw.h | 2 ++ 4 files changed, 82 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8e61f8b7c2ce..d127948b0d8a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -240,6 +240,7 @@ enum io_uring_op { IORING_OP_URING_CMD, IORING_OP_SEND_ZC, IORING_OP_SENDMSG_ZC, + IORING_OP_READ_MULTISHOT, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/opdef.c b/io_uring/opdef.c index f40904065500..a3fb1f9b3998 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -430,9 +430,17 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_READ_MULTISHOT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .audit_skip = 1, + .prep = io_read_mshot_prep, + .issue = io_read_mshot, + }, }; - const struct io_cold_def io_cold_defs[] = { [IORING_OP_NOP] = { .name = "NOP", @@ -650,6 +658,9 @@ const struct io_cold_def io_cold_defs[] = { .fail = io_sendrecv_fail, #endif }, + [IORING_OP_READ_MULTISHOT] = { + .name = "READ_MULTISHOT", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/rw.c b/io_uring/rw.c index 83ae911c2868..ec0cc38ea682 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -123,6 +123,22 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +/* + * Multishot read is prepared just like a normal read/write request, only + * difference is that we set the MULTISHOT flag. + */ +int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = io_prep_rw(req, sqe); + if (unlikely(ret)) + return ret; + + req->flags |= REQ_F_APOLL_MULTISHOT; + return 0; +} + void io_readv_writev_cleanup(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; @@ -869,6 +885,57 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) return ret; } +int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) +{ + unsigned int cflags = 0; + int ret; + + /* + * Multishot MUST be used on a pollable file + */ + if (!file_can_poll(req->file)) + return -EBADFD; + + ret = __io_read(req, issue_flags); + + /* + * If we get -EAGAIN, recycle our buffer and just let normal poll + * handling arm it. + */ + if (ret == -EAGAIN) { + io_kbuf_recycle(req, issue_flags); + return -EAGAIN; + } + + /* + * Any successful return value will keep the multishot read armed. + */ + if (ret > 0) { + /* + * Put our buffer and post a CQE. If we fail to post a CQE, then + * jump to the termination path. This request is then done. + */ + cflags = io_put_kbuf(req, issue_flags); + + if (io_fill_cqe_req_aux(req, + issue_flags & IO_URING_F_COMPLETE_DEFER, + ret, cflags | IORING_CQE_F_MORE)) { + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; + return -EAGAIN; + } + } + + /* + * Either an error, or we've hit overflow posting the CQE. For any + * multishot request, hitting overflow will terminate it. 
+ */ + io_req_set_res(req, ret, cflags); + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_STOP_MULTISHOT; + return IOU_OK; +} + int io_write(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); diff --git a/io_uring/rw.h b/io_uring/rw.h index 4b89f9659366..c5aed03d42a4 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -23,3 +23,5 @@ int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); +int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); From 9d900d4ea352069de8728f11fe4b20051d64cc20 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Jul 2023 10:31:10 -0600 Subject: [PATCH 04/14] exit: abstract out should_wake helper for child_wait_callback() Abstract out the helper that decides if we should wake up following a wake_up() callback on our internal waitqueue. No functional changes intended in this patch. Acked-by: Christian Brauner Signed-off-by: Jens Axboe --- kernel/exit.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index edb50b4c9972..2809dad69492 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1520,6 +1520,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +static bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return false; + + if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) + return false; + + return true; +} + static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { @@ -1527,13 +1538,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, child_wait); struct task_struct *p = key; - if (!eligible_pid(wo, p)) - return 0; + if (pid_child_should_wake(wo, p)) + return default_wake_function(wait, mode, sync, key); - if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) - return 0; - - return default_wake_function(wait, mode, sync, key); + return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) From 06a101ca45b296fe951692620b0bc49abf90c368 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Jul 2023 10:34:37 -0600 Subject: [PATCH 05/14] exit: move core of do_wait() into helper Rather than have a maze of gotos, put the actual logic in __do_wait() and have do_wait() loop deal with waitqueue setup/teardown and whether to call __do_wait() again. No functional changes intended in this patch. Acked-by: Christian Brauner Signed-off-by: Jens Axboe --- kernel/exit.c | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 2809dad69492..c6fba9ecca27 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1590,16 +1590,10 @@ static int do_wait_pid(struct wait_opts *wo) return 0; } -static long do_wait(struct wait_opts *wo) +static long __do_wait(struct wait_opts *wo) { - int retval; + long retval; - trace_sched_process_wait(wo->wo_pid); - - init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); - wo->child_wait.private = current; - add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); -repeat: /* * If there is nothing that can match our criteria, just get out. 
* We will clear ->notask_error to zero if we see any child that @@ -1611,24 +1605,23 @@ static long do_wait(struct wait_opts *wo) (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; - set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) - goto end; + return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) - goto end; + return retval; retval = ptrace_do_wait(wo, tsk); if (retval) - goto end; + return retval; if (wo->wo_flags & __WNOTHREAD) break; @@ -1638,14 +1631,32 @@ static long do_wait(struct wait_opts *wo) notask: retval = wo->notask_error; - if (!retval && !(wo->wo_flags & WNOHANG)) { - retval = -ERESTARTSYS; - if (!signal_pending(current)) { - schedule(); - goto repeat; - } - } -end: + if (!retval && !(wo->wo_flags & WNOHANG)) + return -ERESTARTSYS; + + return retval; +} + +static long do_wait(struct wait_opts *wo) +{ + int retval; + + trace_sched_process_wait(wo->wo_pid); + + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); + + do { + set_current_state(TASK_INTERRUPTIBLE); + retval = __do_wait(wo); + if (retval != -ERESTARTSYS) + break; + if (signal_pending(current)) + break; + schedule(); + } while (1); + __set_current_state(TASK_RUNNING); remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; From eda7e9d409ce16960d5ed28bedf8a33b2667a93c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Jul 2023 10:38:23 -0600 Subject: [PATCH 06/14] exit: add kernel_waitid_prepare() helper Move the setup logic out of kernel_waitid(), and into a separate helper. No functional changes intended in this patch. 
Signed-off-by: Jens Axboe --- kernel/exit.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index c6fba9ecca27..817c22bd7ae0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1662,14 +1662,13 @@ static long do_wait(struct wait_opts *wo) return retval; } -static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, - int options, struct rusage *ru) +static int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, + struct waitid_info *infop, int options, + struct rusage *ru) { - struct wait_opts wo; + unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; - long ret; - unsigned int f_flags = 0; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) @@ -1712,19 +1711,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, return -EINVAL; } - wo.wo_type = type; - wo.wo_pid = pid; - wo.wo_flags = options; - wo.wo_info = infop; - wo.wo_rusage = ru; + wo->wo_type = type; + wo->wo_pid = pid; + wo->wo_flags = options; + wo->wo_info = infop; + wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) - wo.wo_flags |= WNOHANG; + wo->wo_flags |= WNOHANG; + + return 0; +} + +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, + int options, struct rusage *ru) +{ + struct wait_opts wo; + long ret; + + ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); + if (ret) + return ret; ret = do_wait(&wo); - if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) + if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; - put_pid(pid); + put_pid(wo.wo_pid); return ret; } From 2e521a2064bf8b26cf178c0f7644a70ed1a512fa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Jul 2023 10:40:31 -0600 Subject: [PATCH 07/14] exit: add internal include file with helpers Move struct wait_opts and waitid_info into kernel/exit.h, and include function declarations for the recently added helpers. Make them non-static as well. This is in preparation for adding a waitid operation through io_uring. With the abtracted helpers, this is now possible. 
Signed-off-by: Jens Axboe --- kernel/exit.c | 32 +++++++------------------------- kernel/exit.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 25 deletions(-) create mode 100644 kernel/exit.h diff --git a/kernel/exit.c b/kernel/exit.c index 817c22bd7ae0..2b4a232f2f68 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -74,6 +74,8 @@ #include #include +#include "exit.h" + /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit @@ -1037,26 +1039,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code) return 0; } -struct waitid_info { - pid_t pid; - uid_t uid; - int status; - int cause; -}; - -struct wait_opts { - enum pid_type wo_type; - int wo_flags; - struct pid *wo_pid; - - struct waitid_info *wo_info; - int wo_stat; - struct rusage *wo_rusage; - - wait_queue_entry_t child_wait; - int notask_error; -}; - static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || @@ -1520,7 +1502,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } -static bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; @@ -1590,7 +1572,7 @@ static int do_wait_pid(struct wait_opts *wo) return 0; } -static long __do_wait(struct wait_opts *wo) +long __do_wait(struct wait_opts *wo) { long retval; @@ -1662,9 +1644,9 @@ static long do_wait(struct wait_opts *wo) return retval; } -static int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, - struct waitid_info *infop, int options, - struct rusage *ru) +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, + struct waitid_info *infop, int options, + struct rusage *ru) { unsigned int f_flags = 0; struct pid *pid = NULL; diff --git a/kernel/exit.h b/kernel/exit.h new file mode 100644 index 000000000000..278faa26a653 --- /dev/null +++ b/kernel/exit.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef LINUX_WAITID_H +#define LINUX_WAITID_H + +struct waitid_info { + pid_t pid; + uid_t uid; + int status; + int cause; +}; + +struct wait_opts { + enum pid_type wo_type; + int wo_flags; + struct pid *wo_pid; + + struct waitid_info *wo_info; + int wo_stat; + struct rusage *wo_rusage; + + wait_queue_entry_t child_wait; + int notask_error; +}; + +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p); +long __do_wait(struct wait_opts *wo); +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, + struct waitid_info *infop, int options, + struct rusage *ru); +#endif From f31ecf671ddc498f20219453395794ff2383e06b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jul 2023 16:14:37 -0600 Subject: [PATCH 08/14] io_uring: add IORING_OP_WAITID support This adds support for an async version of waitid(2), in a fully async version. If an event isn't immediately available, wait for a callback to trigger a retry. The format of the sqe is as follows: sqe->len The 'which', the idtype being queried/waited for. sqe->fd The 'pid' (or id) being waited for. sqe->file_index The 'options' being set. sqe->addr2 A pointer to siginfo_t, if any, being filled in. buf_index, add3, and waitid_flags are reserved/unused for now. waitid_flags will be used for options for this request type. 
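For illustration only (not added by this patch), the sqe layout above translates to roughly the following userspace setup; child_pid, the user_data value and the ring are placeholders, and liburing is assumed only for ring setup and submission:

	siginfo_t si;
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	memset(sqe, 0, sizeof(*sqe));		/* buf_index/addr3/waitid_flags must be 0 */
	sqe->opcode = IORING_OP_WAITID;
	sqe->len = P_PID;			/* the 'which' / idtype */
	sqe->fd = child_pid;			/* the id being waited for */
	sqe->file_index = WEXITED;		/* waitid options */
	sqe->addr2 = (__u64)(uintptr_t)&si;	/* siginfo_t to fill in, may be 0 */
	sqe->user_data = 0x100;
	io_uring_submit(&ring);

	/* on completion, cqe->res is 0 on success (si filled in) or -errno */
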
One interesting use case may be to add multi-shot support, so that the request stays armed and posts a notification every time a monitored process state change occurs. Note that this does not support rusage, on Arnd's recommendation. See the waitid(2) man page for details on the arguments. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 + include/uapi/linux/io_uring.h | 2 + io_uring/Makefile | 3 +- io_uring/cancel.c | 5 + io_uring/io_uring.c | 3 + io_uring/opdef.c | 9 + io_uring/waitid.c | 372 +++++++++++++++++++++++++++++++++ io_uring/waitid.h | 15 ++ 8 files changed, 410 insertions(+), 1 deletion(-) create mode 100644 io_uring/waitid.c create mode 100644 io_uring/waitid.h diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 13d19b9be9f4..fe1c5d4ec56c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -313,6 +313,8 @@ struct io_ring_ctx { struct list_head cq_overflow_list; struct io_hash_table cancel_table; + struct hlist_head waitid_list; + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d127948b0d8a..683ac2b74721 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -65,6 +65,7 @@ struct io_uring_sqe { __u32 xattr_flags; __u32 msg_ring_flags; __u32 uring_cmd_flags; + __u32 waitid_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -241,6 +242,7 @@ enum io_uring_op { IORING_OP_SEND_ZC, IORING_OP_SENDMSG_ZC, IORING_OP_READ_MULTISHOT, + IORING_OP_WAITID, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/Makefile b/io_uring/Makefile index 8cc8e5387a75..7bd64e442567 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o + cancel.o kbuf.o rsrc.o rw.o opdef.o \ + notif.o waitid.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 7b23607cf4af..eb77a51c5a79 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -15,6 +15,7 @@ #include "tctx.h" #include "poll.h" #include "timeout.h" +#include "waitid.h" #include "cancel.h" struct io_cancel { @@ -119,6 +120,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, if (ret != -ENOENT) return ret; + ret = io_waitid_cancel(ctx, cd, issue_flags); + if (ret != -ENOENT) + return ret; + spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 783ed0fff71b..2dff4772bf14 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "cancel.h" #include "net.h" #include "notif.h" +#include "waitid.h" #include "timeout.h" #include "poll.h" @@ -348,6 +349,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_WQ_LIST(&ctx->locked_free_list); + INIT_HLIST_HEAD(&ctx->waitid_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); return ctx; @@ -3303,6 +3305,7 @@ static __cold bool 
io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); + ret |= io_waitid_remove_all(ctx, task, cancel_all); mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index a3fb1f9b3998..aadcbf7136b0 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -33,6 +33,7 @@ #include "poll.h" #include "cancel.h" #include "rw.h" +#include "waitid.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -439,6 +440,10 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_read_mshot_prep, .issue = io_read_mshot, }, + [IORING_OP_WAITID] = { + .prep = io_waitid_prep, + .issue = io_waitid, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -661,6 +666,10 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_READ_MULTISHOT] = { .name = "READ_MULTISHOT", }, + [IORING_OP_WAITID] = { + .name = "WAITID", + .async_size = sizeof(struct io_waitid_async), + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/waitid.c b/io_uring/waitid.c new file mode 100644 index 000000000000..6f851978606d --- /dev/null +++ b/io_uring/waitid.c @@ -0,0 +1,372 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for async notification of waitid + */ +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring.h" +#include "cancel.h" +#include "waitid.h" +#include "../kernel/exit.h" + +static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); + +#define IO_WAITID_CANCEL_FLAG BIT(31) +#define IO_WAITID_REF_MASK GENMASK(30, 0) + +struct io_waitid { + struct file *file; + int which; + pid_t upid; + int options; + atomic_t refs; + struct wait_queue_head *head; + struct siginfo __user *infop; + struct waitid_info info; +}; + +static void io_waitid_free(struct io_kiocb *req) +{ + struct io_waitid_async *iwa = req->async_data; + + put_pid(iwa->wo.wo_pid); + kfree(req->async_data); + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; +} + +#ifdef CONFIG_COMPAT +static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) +{ + struct compat_siginfo __user *infop; + bool ret; + + infop = (struct compat_siginfo __user *) iw->infop; + + if (!user_write_access_begin(infop, sizeof(*infop))) + return false; + + unsafe_put_user(signo, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user(iw->info.cause, &infop->si_code, Efault); + unsafe_put_user(iw->info.pid, &infop->si_pid, Efault); + unsafe_put_user(iw->info.uid, &infop->si_uid, Efault); + unsafe_put_user(iw->info.status, &infop->si_status, Efault); + ret = true; +done: + user_write_access_end(); + return ret; +Efault: + ret = false; + goto done; +} +#endif + +static bool io_waitid_copy_si(struct io_kiocb *req, int signo) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + bool ret; + + if (!iw->infop) + return true; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return io_waitid_compat_copy_si(iw, signo); +#endif + + if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) + return false; + + unsafe_put_user(signo, &iw->infop->si_signo, Efault); + unsafe_put_user(0, &iw->infop->si_errno, Efault); + unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault); + unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault); + unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault); + 
unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault); + ret = true; +done: + user_write_access_end(); + return ret; +Efault: + ret = false; + goto done; +} + +static int io_waitid_finish(struct io_kiocb *req, int ret) +{ + int signo = 0; + + if (ret > 0) { + signo = SIGCHLD; + ret = 0; + } + + if (!io_waitid_copy_si(req, signo)) + ret = -EFAULT; + io_waitid_free(req); + return ret; +} + +static void io_waitid_complete(struct io_kiocb *req, int ret) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_tw_state ts = { .locked = true }; + + /* anyone completing better be holding a reference */ + WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); + + lockdep_assert_held(&req->ctx->uring_lock); + + /* + * Did cancel find it meanwhile? + */ + if (hlist_unhashed(&req->hash_node)) + return; + + hlist_del_init(&req->hash_node); + + ret = io_waitid_finish(req, ret); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + io_req_task_complete(req, &ts); +} + +static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa = req->async_data; + + /* + * Mark us canceled regardless of ownership. This will prevent a + * potential retry from a spurious wakeup. + */ + atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs); + + /* claim ownership */ + if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) + return false; + + spin_lock_irq(&iw->head->lock); + list_del_init(&iwa->wo.child_wait.entry); + spin_unlock_irq(&iw->head->lock); + io_waitid_complete(req, -ECANCELED); + return true; +} + +int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + int nr = 0; + + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) + return -ENOENT; + + io_ring_submit_lock(ctx, issue_flags); + hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { + if (req->cqe.user_data != cd->data && + !(cd->flags & IORING_ASYNC_CANCEL_ANY)) + continue; + if (__io_waitid_cancel(ctx, req)) + nr++; + if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) + break; + } + io_ring_submit_unlock(ctx, issue_flags); + + if (nr) + return nr; + + return -ENOENT; +} + +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, + bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool found = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { + if (!io_match_task_safe(req, task, cancel_all)) + continue; + __io_waitid_cancel(ctx, req); + found = true; + } + + return found; +} + +static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa = req->async_data; + + if (!atomic_sub_return(1, &iw->refs)) + return false; + + /* + * Wakeup triggered, racing with us. It was prevented from + * completing because of that, queue up the tw to do that. 
+ */ + req->io_task_work.func = io_waitid_cb; + io_req_task_work_add(req); + remove_wait_queue(iw->head, &iwa->wo.child_wait); + return true; +} + +static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) +{ + struct io_waitid_async *iwa = req->async_data; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + io_tw_lock(ctx, ts); + + ret = __do_wait(&iwa->wo); + + /* + * If we get -ERESTARTSYS here, we need to re-arm and check again + * to ensure we get another callback. If the retry works, then we can + * just remove ourselves from the waitqueue again and finish the + * request. + */ + if (unlikely(ret == -ERESTARTSYS)) { + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + + /* Don't retry if cancel found it meanwhile */ + ret = -ECANCELED; + if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) { + iw->head = ¤t->signal->wait_chldexit; + add_wait_queue(iw->head, &iwa->wo.child_wait); + ret = __do_wait(&iwa->wo); + if (ret == -ERESTARTSYS) { + /* retry armed, drop our ref */ + io_waitid_drop_issue_ref(req); + return; + } + + remove_wait_queue(iw->head, &iwa->wo.child_wait); + } + } + + io_waitid_complete(req, ret); +} + +static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, + int sync, void *key) +{ + struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); + struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo); + struct io_kiocb *req = iwa->req; + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct task_struct *p = key; + + if (!pid_child_should_wake(wo, p)) + return 0; + + /* cancel is in progress */ + if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) + return 1; + + req->io_task_work.func = io_waitid_cb; + io_req_task_work_add(req); + list_del_init(&wait->entry); + return 1; +} + +int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + + if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) + return -EINVAL; + + iw->which = READ_ONCE(sqe->len); + iw->upid = READ_ONCE(sqe->fd); + iw->options = READ_ONCE(sqe->file_index); + iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + return 0; +} + +int io_waitid(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_ring_ctx *ctx = req->ctx; + struct io_waitid_async *iwa; + int ret; + + if (io_alloc_async_data(req)) + return -ENOMEM; + + iwa = req->async_data; + iwa->req = req; + + ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, + iw->options, NULL); + if (ret) + goto done; + + /* + * Mark the request as busy upfront, in case we're racing with the + * wakeup. If we are, then we'll notice when we drop this initial + * reference again after arming. + */ + atomic_set(&iw->refs, 1); + + /* + * Cancel must hold the ctx lock, so there's no risk of cancelation + * finding us until a) we remain on the list, and b) the lock is + * dropped. We only need to worry about racing with the wakeup + * callback. 
+ */ + io_ring_submit_lock(ctx, issue_flags); + hlist_add_head(&req->hash_node, &ctx->waitid_list); + + init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); + iwa->wo.child_wait.private = req->task; + iw->head = ¤t->signal->wait_chldexit; + add_wait_queue(iw->head, &iwa->wo.child_wait); + + ret = __do_wait(&iwa->wo); + if (ret == -ERESTARTSYS) { + /* + * Nobody else grabbed a reference, it'll complete when we get + * a waitqueue callback, or if someone cancels it. + */ + if (!io_waitid_drop_issue_ref(req)) { + io_ring_submit_unlock(ctx, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + + /* + * Wakeup triggered, racing with us. It was prevented from + * completing because of that, queue up the tw to do that. + */ + io_ring_submit_unlock(ctx, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + + hlist_del_init(&req->hash_node); + remove_wait_queue(iw->head, &iwa->wo.child_wait); + ret = io_waitid_finish(req, ret); + + io_ring_submit_unlock(ctx, issue_flags); +done: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/waitid.h b/io_uring/waitid.h new file mode 100644 index 000000000000..956a8adafe8c --- /dev/null +++ b/io_uring/waitid.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "../kernel/exit.h" + +struct io_waitid_async { + struct io_kiocb *req; + struct wait_opts wo; +}; + +int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_waitid(struct io_kiocb *req, unsigned int issue_flags); +int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags); +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, + bool cancel_all); From 528ce6781726e022bc5dc84034360e6e8f1b89bd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 28 Sep 2023 20:43:24 +0800 Subject: [PATCH 09/14] io_uring: retain top 8bits of uring_cmd flags for kernel internal use Retain top 8bits of uring_cmd flags for kernel internal use, so that we can move IORING_URING_CMD_POLLED out of uapi header. Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Anuj Gupta Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 3 +++ include/uapi/linux/io_uring.h | 5 ++--- io_uring/io_uring.c | 3 +++ io_uring/uring_cmd.c | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 106cdc55ff3b..ae08d6f66e62 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -22,6 +22,9 @@ enum io_uring_cmd_flags { IO_URING_F_IOPOLL = (1 << 10), }; +/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ +#define IORING_URING_CMD_POLLED (1U << 31) + struct io_uring_cmd { struct file *file; const struct io_uring_sqe *sqe; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 683ac2b74721..425f64eee44e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -249,13 +249,12 @@ enum io_uring_op { }; /* - * sqe->uring_cmd_flags + * sqe->uring_cmd_flags top 8bits aren't available for userspace * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. 
- * IORING_URING_CMD_POLLED driver use only */ #define IORING_URING_CMD_FIXED (1U << 0) -#define IORING_URING_CMD_POLLED (1U << 31) +#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED /* diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2dff4772bf14..cb6bd9907045 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4669,6 +4669,9 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + /* top 8bits are for internal use */ + BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0); + io_uring_optable_init(); /* diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 537795fddc87..a0b0ec5473bf 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -91,7 +91,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags); - if (ioucmd->flags & ~IORING_URING_CMD_FIXED) + if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; if (ioucmd->flags & IORING_URING_CMD_FIXED) { From 93b8cc60c37b9d17732b7a297e5dca29b50a990d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 28 Sep 2023 20:43:25 +0800 Subject: [PATCH 10/14] io_uring: cancelable uring_cmd uring_cmd may never complete, such as ublk, in which uring cmd isn't completed until one new block request is coming from ublk block device. Add cancelable uring_cmd to provide mechanism to driver for cancelling pending commands in its own way. Add API of io_uring_cmd_mark_cancelable() for driver to mark one command as cancelable, then io_uring will cancel this command in io_uring_cancel_generic(). ->uring_cmd() callback is reused for canceling command in driver's way, then driver gets notified with the cancelling from io_uring. Add API of io_uring_cmd_get_task() to help driver cancel handler deal with the canceling. 
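As a rough sketch of the intended driver-side usage (ublk is the real user; everything below, including my_driver_queue_cmd(), is a made-up placeholder), a ->uring_cmd() handler marks the command cancelable when it queues it and treats IO_URING_F_CANCEL as a request to finish it early:

	static int my_driver_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
	{
		if (issue_flags & IO_URING_F_CANCEL) {
			/*
			 * io_uring is cancelling this previously issued command.
			 * A real driver must serialize this against its normal
			 * completion path before completing with -ECANCELED.
			 */
			io_uring_cmd_done(cmd, -ECANCELED, 0, issue_flags);
			return 0;
		}

		io_uring_cmd_mark_cancelable(cmd, issue_flags);
		my_driver_queue_cmd(cmd);	/* completes later via io_uring_cmd_done() */
		return -EIOCBQUEUED;
	}

The driver's own teardown path can additionally use io_uring_cmd_get_task() to find which task a pending command belongs to when deciding what to cancel.
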
Reviewed-by: Gabriel Krisman Bertazi Suggested-by: Jens Axboe Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 15 +++++++++++ include/linux/io_uring_types.h | 6 +++++ io_uring/io_uring.c | 33 ++++++++++++++++++++++++ io_uring/uring_cmd.c | 47 ++++++++++++++++++++++++++++++++++ 4 files changed, 101 insertions(+) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index ae08d6f66e62..b4391e0a9bc8 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -20,9 +20,13 @@ enum io_uring_cmd_flags { IO_URING_F_SQE128 = (1 << 8), IO_URING_F_CQE32 = (1 << 9), IO_URING_F_IOPOLL = (1 << 10), + + /* set when uring wants to cancel a previously issued command */ + IO_URING_F_CANCEL = (1 << 11), }; /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ +#define IORING_URING_CMD_CANCELABLE (1U << 30) #define IORING_URING_CMD_POLLED (1U << 31) struct io_uring_cmd { @@ -85,6 +89,9 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); +void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags); +struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) @@ -125,6 +132,14 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } +static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ +} +static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return NULL; +} #endif #endif diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fe1c5d4ec56c..e178461fa513 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -265,6 +265,12 @@ struct io_ring_ctx { */ struct io_wq_work_list iopoll_list; bool poll_multi_queue; + + /* + * Any cancelable uring_cmd is added to this list in + * ->uring_cmd() by io_uring_cmd_insert_cancelable() + */ + struct hlist_head cancelable_uring_cmd; } ____cacheline_aligned_in_smp; struct { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cb6bd9907045..08c9ea46bb95 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -352,6 +352,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_HLIST_HEAD(&ctx->waitid_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); + INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); return ctx; err: kfree(ctx->cancel_table.hbs); @@ -3258,6 +3259,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } +static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct task_struct *task, bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool ret = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, + hash_node) { + struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, + struct io_uring_cmd); + struct file *file = req->file; + + if (!cancel_all && req->task != task) + continue; + + if (cmd->flags & IORING_URING_CMD_CANCELABLE) { + /* ->sqe isn't available if no async data */ + if (!req_has_async_data(req)) + cmd->sqe = NULL; + file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); + ret = true; + } + } + io_submit_flush_completions(ctx); + + 
return ret; +} + static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) @@ -3306,6 +3338,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); ret |= io_waitid_remove_all(ctx, task, cancel_all); + ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all); mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a0b0ec5473bf..00a5e5621a28 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -13,6 +13,51 @@ #include "rsrc.h" #include "uring_cmd.h" +static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + struct io_ring_ctx *ctx = req->ctx; + + if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) + return; + + cmd->flags &= ~IORING_URING_CMD_CANCELABLE; + io_ring_submit_lock(ctx, issue_flags); + hlist_del(&req->hash_node); + io_ring_submit_unlock(ctx, issue_flags); +} + +/* + * Mark this command as concelable, then io_uring_try_cancel_uring_cmd() + * will try to cancel this issued command by sending ->uring_cmd() with + * issue_flags of IO_URING_F_CANCEL. + * + * The command is guaranteed to not be done when calling ->uring_cmd() + * with IO_URING_F_CANCEL, but it is driver's responsibility to deal + * with race between io_uring canceling and normal completion. + */ +void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + struct io_ring_ctx *ctx = req->ctx; + + if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { + cmd->flags |= IORING_URING_CMD_CANCELABLE; + io_ring_submit_lock(ctx, issue_flags); + hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd); + io_ring_submit_unlock(ctx, issue_flags); + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); + +struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->task; +} +EXPORT_SYMBOL_GPL(io_uring_cmd_get_task); + static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); @@ -56,6 +101,8 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + io_uring_cmd_del_cancelable(ioucmd, issue_flags); + if (ret < 0) req_set_fail(req); From 922a2c78f13611e2c08fc48f615c0cd367dcb6da Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 2 Oct 2023 18:25:23 -0600 Subject: [PATCH 11/14] io_uring/rsrc: cleanup io_pin_pages() This function is overly convoluted with a goto error path, and checks under the mmap_read_lock() that don't need to be at all. Rearrange it a bit so the checks and errors fall out naturally, rather than needing to jump around for it. 
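After the cleanup the contract is simply pages-or-ERR_PTR; a hypothetical caller (for illustration only, not added by this patch) would look like:

	int npages;
	struct page **pages = io_pin_pages(ubuf, len, &npages);

	if (IS_ERR(pages))
		return PTR_ERR(pages);
	/* ... hand pages[0..npages - 1] to the mapping ... */
	unpin_user_pages(pages, npages);
	kvfree(pages);
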
Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d9c853d10587..7034be555334 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1037,39 +1037,36 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct page **pages = NULL; - int pret, ret = -ENOMEM; + int ret; end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; nr_pages = end - start; + WARN_ON(!nr_pages); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) - goto done; + return ERR_PTR(-ENOMEM); - ret = 0; mmap_read_lock(current->mm); - pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages); - if (pret == nr_pages) - *npages = nr_pages; - else - ret = pret < 0 ? pret : -EFAULT; - + ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); mmap_read_unlock(current->mm); - if (ret) { + + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } + + /* partial map, or didn't map anything */ + if (ret >= 0) { /* if we did partial map, release any pages we did get */ - if (pret > 0) - unpin_user_pages(pages, pret); - goto done; + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; } - ret = 0; -done: - if (ret < 0) { - kvfree(pages); - pages = ERR_PTR(ret); - } - return pages; + kvfree(pages); + return ERR_PTR(ret); } static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, From ab69838e7c75b0edb699c1a8f42752b30333c46f Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 4 Oct 2023 20:05:29 -0400 Subject: [PATCH 12/14] io_uring/kbuf: Fix check of BID wrapping in provided buffers Commit 3851d25c75ed0 ("io_uring: check for rollover of buffer ID when providing buffers") introduced a check to prevent wrapping the BID counter when sqe->off is provided, but it's off-by-one too restrictive, rejecting the last possible BID (65534). i.e., the following fails with -EINVAL. io_uring_prep_provide_buffers(sqe, addr, size, 0xFFFF, 0, 0); Fixes: 3851d25c75ed ("io_uring: check for rollover of buffer ID when providing buffers") Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20231005000531.30800-2-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 556f4df25b0f..52dba81c3f50 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -352,7 +352,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe tmp = READ_ONCE(sqe->off); if (tmp > USHRT_MAX) return -E2BIG; - if (tmp + p->nbufs >= USHRT_MAX) + if (tmp + p->nbufs > USHRT_MAX) return -EINVAL; p->bid = tmp; return 0; From f74c746e476b9dad51448b9a9421aae72b60e25f Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 4 Oct 2023 20:05:30 -0400 Subject: [PATCH 13/14] io_uring/kbuf: Allow the full buffer id space for provided buffers nbufs tracks the number of buffers and not the last bgid. In 16-bit, we have 2^16 valid buffers, but the check mistakenly rejects the last bid. Let's fix it to make the interface consistent with the documentation. 
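As a concrete boundary case (illustrative only; addr must point at 65536 * buf_len bytes of user memory):

	io_uring_prep_provide_buffers(sqe, addr, buf_len, 65536, bgid, 0);

now populates bids 0..65535 and is accepted, since 0 + 65536 does not exceed MAX_BIDS_PER_BGID, whereas starting at bid 1 with the same count is still rejected because it would wrap past the 16-bit BID space.
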
Fixes: ddf0322db79c ("io_uring: add IORING_OP_PROVIDE_BUFFERS") Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20231005000531.30800-3-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 52dba81c3f50..12a357348733 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -19,12 +19,15 @@ #define BGID_ARRAY 64 +/* BIDs are addressed by a 16-bit field in a CQE */ +#define MAX_BIDS_PER_BGID (1 << 16) + struct io_provide_buf { struct file *file; __u64 addr; __u32 len; __u32 bgid; - __u16 nbufs; + __u32 nbufs; __u16 bid; }; @@ -289,7 +292,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) + if (!tmp || tmp > MAX_BIDS_PER_BGID) return -EINVAL; memset(p, 0, sizeof(*p)); @@ -332,7 +335,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) + if (!tmp || tmp > MAX_BIDS_PER_BGID) return -E2BIG; p->nbufs = tmp; p->addr = READ_ONCE(sqe->addr); @@ -352,7 +355,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe tmp = READ_ONCE(sqe->off); if (tmp > USHRT_MAX) return -E2BIG; - if (tmp + p->nbufs > USHRT_MAX) + if (tmp + p->nbufs > MAX_BIDS_PER_BGID) return -EINVAL; p->bid = tmp; return 0; From b3a4dbc89d4021b3f90ff6a13537111a004f9d07 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 4 Oct 2023 20:05:31 -0400 Subject: [PATCH 14/14] io_uring/kbuf: Use slab for struct io_buffer objects The allocation of struct io_buffer for metadata of provided buffers is done through a custom allocator that directly gets pages and fragments them. But, slab would do just fine, as this is not a hot path (in fact, it is a deprecated feature) and, by keeping a custom allocator implementation we lose benefits like tracking, poisoning, sanitizers. Finally, the custom code is more complex and requires keeping the list of pages in struct ctx for no good reason. This patch cleans this path up and just uses slab. I microbenchmarked it by forcing the allocation of a large number of objects with the least number of io_uring commands possible (keeping nbufs=USHRT_MAX), with and without the patch. There is a slight increase in time spent in the allocation with slab, of course, but even when allocating to system resources exhaustion, which is not very realistic and happened around 1/2 billion provided buffers for me, it wasn't a significant hit in system time. Specially if we think of a real-world scenario, an application doing register/unregister of provided buffers will hit ctx->io_buffers_cache more often than actually going to slab. 
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20231005000531.30800-4-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 -- io_uring/io_uring.c | 4 ++- io_uring/io_uring.h | 1 + io_uring/kbuf.c | 47 +++++++++++++++++++--------------- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e178461fa513..e4e67899b134 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -350,8 +350,6 @@ struct io_ring_ctx { struct wait_queue_head rsrc_quiesce_wq; unsigned rsrc_quiesce; - struct list_head io_buffers_pages; - #if defined(CONFIG_UNIX) struct socket *ring_sock; #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 08c9ea46bb95..b9e1af5772f3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -339,7 +339,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_pages); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -4720,6 +4719,9 @@ static int __init io_uring_init(void) SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, offsetof(struct io_kiocb, cmd.data), sizeof_field(struct io_kiocb, cmd.data), NULL); + io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, + NULL); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 547c30582fb8..2ff719ae1b57 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -330,6 +330,7 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) } extern struct kmem_cache *req_cachep; +extern struct kmem_cache *io_buf_cachep; static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 12a357348733..d5a04467666f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -22,6 +22,8 @@ /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) +struct kmem_cache *io_buf_cachep; + struct io_provide_buf { struct file *file; __u64 addr; @@ -258,6 +260,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; + struct list_head *item, *tmp; + struct io_buffer *buf; unsigned long index; int i; @@ -273,12 +277,9 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) kfree(bl); } - while (!list_empty(&ctx->io_buffers_pages)) { - struct page *page; - - page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); - list_del_init(&page->lru); - __free_page(page); + list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { + buf = list_entry(item, struct io_buffer, list); + kmem_cache_free(io_buf_cachep, buf); } } @@ -361,11 +362,12 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return 0; } +#define IO_BUFFER_ALLOC_BATCH 64 + static int io_refill_buffer_cache(struct io_ring_ctx *ctx) { - struct io_buffer *buf; - struct page *page; - int bufs_in_page; + struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; + int allocated; /* * Completions that don't happen inline (eg not under uring_lock) will @@ -385,22 +387,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx) /* * No free buffers and no completion 
entries either. Allocate a new - * page worth of buffer entries and add those to our freelist. + * batch of buffer entries and add those to our freelist. */ - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) - return -ENOMEM; - list_add(&page->lru, &ctx->io_buffers_pages); - - buf = page_address(page); - bufs_in_page = PAGE_SIZE / sizeof(*buf); - while (bufs_in_page) { - list_add_tail(&buf->list, &ctx->io_buffers_cache); - buf++; - bufs_in_page--; + allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, + ARRAY_SIZE(bufs), (void **) bufs); + if (unlikely(!allocated)) { + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. + */ + bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); + if (!bufs[0]) + return -ENOMEM; + allocated = 1; } + while (allocated) + list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); + return 0; }