From 262b2fa99cbe02a715ce23981c2c30685ccf3a93 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 5 Feb 2025 19:17:47 +0100 Subject: [PATCH 01/10] pipe: introduce struct file_operations pipeanon_fops So that fifos and anonymous pipes could have different f_op methods. Preparation to simplify the next patch. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250205181747.GB13817@redhat.com Tested-by: K Prateek Nayak Signed-off-by: Christian Brauner --- fs/pipe.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 94b59045ab44..2eacfde61e74 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -878,6 +878,8 @@ static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; +static const struct file_operations pipeanon_fops; + static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); @@ -895,7 +897,7 @@ static struct inode * get_pipe_inode(void) inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; - inode->i_fop = &pipefifo_fops; + inode->i_fop = &pipeanon_fops; /* * Mark the inode dirty from the very beginning, @@ -938,7 +940,7 @@ int create_pipe_files(struct file **res, int flags) f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(f)) { free_pipe_info(inode->i_pipe); iput(inode); @@ -949,7 +951,7 @@ int create_pipe_files(struct file **res, int flags) f->f_pipe = 0; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(res[0])) { put_pipe_info(inode, inode->i_pipe); fput(f); @@ -1107,8 +1109,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe) static int fifo_open(struct inode *inode, struct file *filp) { + bool is_pipe = inode->i_fop == &pipeanon_fops; struct pipe_inode_info *pipe; - bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; 
filp->f_pipe = 0; @@ -1241,6 +1243,17 @@ const struct file_operations pipefifo_fops = { .splice_write = iter_file_splice_write, }; +static const struct file_operations pipeanon_fops = { + .open = fifo_open, + .read_iter = pipe_read, + .write_iter = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, + .splice_write = iter_file_splice_write, +}; + /* * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. @@ -1388,7 +1401,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { struct pipe_inode_info *pipe = file->private_data; - if (file->f_op != &pipefifo_fops || !pipe) + if (!pipe) + return NULL; + if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops) return NULL; if (for_splice && pipe_has_watch_queue(pipe)) return NULL; From f017b0a4951fac8f150232661b2cc0b67e0c57f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 5 Feb 2025 19:18:12 +0100 Subject: [PATCH 02/10] pipe: don't update {a,c,m}time for anonymous pipes These numbers are visible in fstat() but hopefully nobody uses this information and file_accessed/file_update_time are not that cheap. 
Stupid test-case: #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <assert.h> #include <sys/ioctl.h> #include <sys/time.h> static char buf[17 * 4096]; static struct timeval TW, TR; int wr(int fd, int size) { int c, r; struct timeval t0, t1; gettimeofday(&t0, NULL); for (c = 0; (r = write(fd, buf, size)) > 0; c += r); gettimeofday(&t1, NULL); timeradd(&TW, &t1, &TW); timersub(&TW, &t0, &TW); return c; } int rd(int fd, int size) { int c, r; struct timeval t0, t1; gettimeofday(&t0, NULL); for (c = 0; (r = read(fd, buf, size)) > 0; c += r); gettimeofday(&t1, NULL); timeradd(&TR, &t1, &TR); timersub(&TR, &t0, &TR); return c; } int main(int argc, const char *argv[]) { int fd[2], nb = 1, loop, size; assert(argc == 3); loop = atoi(argv[1]); size = atoi(argv[2]); assert(pipe(fd) == 0); assert(ioctl(fd[0], FIONBIO, &nb) == 0); assert(ioctl(fd[1], FIONBIO, &nb) == 0); assert(size <= sizeof(buf)); while (loop--) assert(wr(fd[1], size) == rd(fd[0], size)); struct timeval tt; timeradd(&TW, &TR, &tt); printf("TW = %lu.%03lu TR = %lu.%03lu TT = %lu.%03lu\n", TW.tv_sec, TW.tv_usec/1000, TR.tv_sec, TR.tv_usec/1000, tt.tv_sec, tt.tv_usec/1000); return 0; } Before: # for i in 1 2 3; do /host/tmp/test 10000 100; done TW = 8.047 TR = 5.845 TT = 13.893 TW = 8.091 TR = 5.872 TT = 13.963 TW = 8.083 TR = 5.885 TT = 13.969 After: # for i in 1 2 3; do /host/tmp/test 10000 100; done TW = 4.752 TR = 4.664 TT = 9.416 TW = 4.684 TR = 4.608 TT = 9.293 TW = 4.736 TR = 4.652 TT = 9.388 Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250205181812.GC13817@redhat.com Tested-by: K Prateek Nayak Signed-off-by: Christian Brauner --- fs/pipe.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 2eacfde61e74..2ae75adfba64 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -248,7 +248,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, } static ssize_t -pipe_read(struct kiocb *iocb, struct iov_iter *to) +anon_pipe_read(struct kiocb *iocb, 
struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; @@ -404,8 +404,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + return ret; +} + +static ssize_t +fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to) +{ + int ret = anon_pipe_read(iocb, to); if (ret > 0) - file_accessed(filp); + file_accessed(iocb->ki_filp); return ret; } @@ -426,7 +433,7 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe) } static ssize_t -pipe_write(struct kiocb *iocb, struct iov_iter *from) +anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; @@ -604,11 +611,21 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { - int err = file_update_time(filp); - if (err) - ret = err; - sb_end_write(file_inode(filp)->i_sb); + return ret; +} + +static ssize_t +fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from) +{ + int ret = anon_pipe_write(iocb, from); + if (ret > 0) { + struct file *filp = iocb->ki_filp; + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { + int err = file_update_time(filp); + if (err) + ret = err; + sb_end_write(file_inode(filp)->i_sb); + } } return ret; } @@ -1234,8 +1251,8 @@ static int fifo_open(struct inode *inode, struct file *filp) const struct file_operations pipefifo_fops = { .open = fifo_open, - .read_iter = pipe_read, - .write_iter = pipe_write, + .read_iter = fifo_pipe_read, + .write_iter = fifo_pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, @@ -1245,8 +1262,8 @@ const struct file_operations 
pipefifo_fops = { static const struct file_operations pipeanon_fops = { .open = fifo_open, - .read_iter = pipe_read, - .write_iter = pipe_write, + .read_iter = anon_pipe_read, + .write_iter = anon_pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, From ee5eda8ea59546af2e8f192c060fbf29862d7cbd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 10 Feb 2025 12:40:39 +0100 Subject: [PATCH 03/10] pipe: change pipe_write() to never add a zero-sized buffer a194dfe6e6f6 ("pipe: Rearrange sequence in pipe_write() to preallocate slot") changed pipe_write() to increment pipe->head in advance. IIUC to avoid the race with the post_one_notification()-like code which can add another buffer under pipe->rd_wait.lock without pipe->mutex. This is no longer necessary after c73be61cede5 ("pipe: Add general notification queue support"), pipe_write() checks pipe_has_watch_queue() and returns -EXDEV at the start. And can't help in any case, pipe_write() no longer takes this rd_wait.lock spinlock. Change pipe_write() to call copy_page_from_iter() first and do nothing if it fails. This way pipe_write() can't add a zero-sized buffer and we can simplify pipe_read() which currently has to take care of this very unlikely case. Also, with this patch we can probably kill eat_empty_buffer() and more "is this buffer empty" checks in fs/splice.c later. 
Link: https://lore.kernel.org/all/20250209150718.GA17013@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250210114039.GA3588@redhat.com Tested-by: K Prateek Nayak Signed-off-by: Christian Brauner --- fs/pipe.c | 55 ++++++++++++++----------------------------------------- 1 file changed, 14 insertions(+), 41 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 2ae75adfba64..b0641f75b1ba 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -360,29 +360,9 @@ anon_pipe_read(struct kiocb *iocb, struct iov_iter *to) break; } mutex_unlock(&pipe->mutex); - /* * We only get here if we didn't actually read anything. * - * However, we could have seen (and removed) a zero-sized - * pipe buffer, and might have made space in the buffers - * that way. - * - * You can't make zero-sized pipe buffers by doing an empty - * write (not even in packet mode), but they can happen if - * the writer gets an EFAULT when trying to fill a buffer - * that already got allocated and inserted in the buffer - * array. - * - * So we still need to wake up any pending writers in the - * _very_ unlikely case that the pipe was full, but we got - * no data. - */ - if (unlikely(wake_writer)) - wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - - /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need @@ -391,7 +371,6 @@ anon_pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - wake_writer = false; wake_next_reader = true; mutex_lock(&pipe->mutex); } @@ -526,33 +505,27 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) pipe->tmp_page = page; } - /* Allocate a slot in the ring in advance and attach an - * empty buffer. 
If we fault or otherwise fail to use - * it, either the reader will consume it or it'll still - * be there for the next write. - */ - pipe->head = head + 1; - - /* Insert it into the buffer array */ - buf = &pipe->bufs[head & mask]; - buf->page = page; - buf->ops = &anon_pipe_buf_ops; - buf->offset = 0; - buf->len = 0; - if (is_packetized(filp)) - buf->flags = PIPE_BUF_FLAG_PACKET; - else - buf->flags = PIPE_BUF_FLAG_CAN_MERGE; - pipe->tmp_page = NULL; - copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { if (!ret) ret = -EFAULT; break; } - ret += copied; + + pipe->head = head + 1; + pipe->tmp_page = NULL; + /* Insert it into the buffer array */ + buf = &pipe->bufs[head & mask]; + buf->page = page; + buf->ops = &anon_pipe_buf_ops; + buf->offset = 0; + if (is_packetized(filp)) + buf->flags = PIPE_BUF_FLAG_PACKET; + else + buf->flags = PIPE_BUF_FLAG_CAN_MERGE; + buf->len = copied; + ret += copied; if (!iov_iter_count(from)) break; From a40cd5849dab4906f54f27e28561bd9298455bcf Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Mar 2025 00:04:07 +0100 Subject: [PATCH 04/10] pipe: drop an always true check in anon_pipe_write() The check operates on the stale value of 'head' and always loops back. Just do it unconditionally. No functional changes. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250303230409.452687-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/pipe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index b0641f75b1ba..b60487b650cb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -529,10 +529,9 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) if (!iov_iter_count(from)) break; - } - if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; + } /* Wait for buffer space to become available. 
*/ if ((filp->f_flags & O_NONBLOCK) || From 46af8e2406c27cc2f21094983697ff872102065f Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Mar 2025 00:04:08 +0100 Subject: [PATCH 05/10] pipe: cache 2 pages instead of 1 User data is kept in a circular buffer backed by pages allocated as needed. Only having space for one spare is still prone to having to resort to allocation / freeing. In my testing this decreases page allocs by 60% during a kernel build. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250303230409.452687-3-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/pipe.c | 60 ++++++++++++++++++++++++++------------- include/linux/pipe_fs_i.h | 2 +- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index b60487b650cb..f2d5427fba8e 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -112,20 +112,40 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, pipe_lock(pipe2); } +static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe) +{ + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) { + struct page *page = pipe->tmp_page[i]; + pipe->tmp_page[i] = NULL; + return page; + } + } + + return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); +} + +static void anon_pipe_put_page(struct pipe_inode_info *pipe, + struct page *page) +{ + if (page_count(page) == 1) { + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (!pipe->tmp_page[i]) { + pipe->tmp_page[i] = page; + return; + } + } + } + + put_page(page); +} + static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; - /* - * If nobody else uses this page, and we don't already have a - * temporary page, let's keep track of it as a one-deep - * allocation cache. 
(Otherwise just release our reference to it) - */ - if (page_count(page) == 1 && !pipe->tmp_page) - pipe->tmp_page = page; - else - put_page(page); + anon_pipe_put_page(pipe, page); } static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, @@ -493,27 +513,25 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) if (!pipe_full(head, pipe->tail, pipe->max_usage)) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf; - struct page *page = pipe->tmp_page; + struct page *page; int copied; - if (!page) { - page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); - if (unlikely(!page)) { - ret = ret ? : -ENOMEM; - break; - } - pipe->tmp_page = page; + page = anon_pipe_get_page(pipe); + if (unlikely(!page)) { + if (!ret) + ret = -ENOMEM; + break; } copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { + anon_pipe_put_page(pipe, page); if (!ret) ret = -EFAULT; break; } pipe->head = head + 1; - pipe->tmp_page = NULL; /* Insert it into the buffer array */ buf = &pipe->bufs[head & mask]; buf->page = page; @@ -846,8 +864,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (pipe->watch_queue) put_watch_queue(pipe->watch_queue); #endif - if (pipe->tmp_page) - __free_page(pipe->tmp_page); + for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) + __free_page(pipe->tmp_page[i]); + } kfree(pipe->bufs); kfree(pipe); } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 8ff23bf5a819..eb7994a1ff93 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -72,7 +72,7 @@ struct pipe_inode_info { #ifdef CONFIG_WATCH_QUEUE bool note_loss; #endif - struct page *tmp_page; + struct page *tmp_page[2]; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; From 84654c7f47307692d47ea914d01287c8c54b3532 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Mar 2025 00:04:09 +0100 Subject: [PATCH 
06/10] wait: avoid spurious calls to prepare_to_wait_event() in ___wait_event() In the vast majority of cases the condition determining whether the thread can proceed is true after the first wake up. However, even in that case the thread ends up calling into prepare_to_wait_event() again, suffering a spurious irq + lock trip. Then it calls into finish_wait() to unlink itself. Note that in case of a pending signal the work done by prepare_to_wait_event() gets ignored even without the change. Pre-check the condition after waking up instead. Stats gathered during a kernel build: bpftrace -e 'kprobe:prepare_to_wait_event,kprobe:finish_wait \ { @[probe] = count(); }' @[kprobe:finish_wait]: 392483 @[kprobe:prepare_to_wait_event]: 778690 That is, calls to prepare_to_wait_event() are almost double the calls to finish_wait(). This evens out with the patch. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250303230409.452687-4-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/wait.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/wait.h b/include/linux/wait.h index 6d90ad974408..3503fe822e38 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -316,6 +316,9 @@ extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); } \ \ cmd; \ + \ + if (condition) \ + break; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ From cf3d0c54b21c4a351d4f94cf188e9715dbd1ef5b Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 7 Mar 2025 05:29:16 +0000 Subject: [PATCH 07/10] fs/pipe: Limit the slots in pipe_resize_ring() Limit the number of slots in pipe_resize_ring() to the maximum value representable by pipe->{head,tail}. Values beyond the max limit can lead to incorrect pipe occupancy related calculations where the pipe will never appear full. 
Suggested-by: Linus Torvalds Signed-off-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250307052919.34542-2-kprateek.nayak@amd.com Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- fs/pipe.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/pipe.c b/fs/pipe.c index 4d0799e4e719..88e81f84e3ea 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1271,6 +1271,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) struct pipe_buffer *bufs; unsigned int head, tail, mask, n; + /* nr_slots larger than limits of pipe->{head,tail} */ + if (unlikely(nr_slots > (pipe_index_t)-1u)) + return -EINVAL; + bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) From 547476063e123e73fc3aea8432104ab092ffea84 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 7 Mar 2025 05:29:17 +0000 Subject: [PATCH 08/10] kernel/watch_queue: Use pipe_buf() to retrieve the pipe buffer Use pipe_buf() helper to retrieve the pipe buffer in post_one_notification() replacing the open-coded logic. 
Suggested-by: Oleg Nesterov Signed-off-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250307052919.34542-3-kprateek.nayak@amd.com Signed-off-by: Christian Brauner --- kernel/watch_queue.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 5267adeaa403..605129eb61a1 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -101,12 +101,11 @@ static bool post_one_notification(struct watch_queue *wqueue, struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; - unsigned int head, tail, mask, note, offset, len; + unsigned int head, tail, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) @@ -124,7 +123,7 @@ static bool post_one_notification(struct watch_queue *wqueue, memcpy(p + offset, n, len); kunmap_atomic(p); - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; @@ -147,7 +146,7 @@ static bool post_one_notification(struct watch_queue *wqueue, return done; lost: - buf = &pipe->bufs[(head - 1) & mask]; + buf = pipe_buf(pipe, head - 1); buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } From ba0822021c3c5aa8029c16078c7091d35b4979bc Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 7 Mar 2025 05:29:18 +0000 Subject: [PATCH 09/10] fs/pipe: Use pipe_buf() helper to retrieve pipe buffer Use pipe_buf() helper to retrieve the pipe buffer throughout the file replacing the open-coded logic. 
Suggested-by: Oleg Nesterov Signed-off-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250307052919.34542-4-kprateek.nayak@amd.com Signed-off-by: Christian Brauner --- fs/pipe.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 88e81f84e3ea..4d6ca0f892b1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -274,7 +274,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; #ifdef CONFIG_WATCH_QUEUE if (pipe->note_loss) { @@ -301,7 +300,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) #endif if (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t chars = buf->len; size_t written; int error; @@ -471,8 +470,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); if (chars && !was_empty) { - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head - 1); int offset = buf->offset + buf->len; if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && @@ -503,7 +501,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { - unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf; struct page *page = pipe->tmp_page; int copied; @@ -525,7 +522,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) pipe->head = head + 1; /* Insert it into the buffer array */ - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; From d5c6cb01b69c0a25f7a652627d217120fb6cad0d Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 7 Mar 2025 05:29:19 +0000 
Subject: [PATCH 10/10] fs/splice: Use pipe_buf() helper to retrieve pipe buffer Use pipe_buf() helper to retrieve the pipe buffer throughout the file replacing the open-coded logic. Suggested-by: Oleg Nesterov Signed-off-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250307052919.34542-5-kprateek.nayak@amd.com Signed-off-by: Christian Brauner --- fs/splice.c | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 23fa5561b944..90d464241f15 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -200,7 +200,6 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; - unsigned int mask = pipe->ring_size - 1; ssize_t ret = 0; int page_nr = 0; @@ -214,7 +213,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, } while (!pipe_full(head, tail, pipe->max_usage)) { - struct pipe_buffer *buf = &pipe->bufs[head & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head); buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; @@ -247,7 +246,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { @@ -256,7 +254,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { - pipe->bufs[head & mask] = *buf; + *pipe_buf(pipe, head) = *buf; pipe->head = head + 1; return buf->len; } @@ -447,11 +445,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer 
*buf = pipe_buf(pipe, tail); sd->len = buf->len; if (sd->len > sd->total_len) @@ -495,8 +492,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (unlikely(!buf->len)) { pipe_buf_release(pipe, buf); @@ -690,7 +686,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; - unsigned int head, tail, mask; + unsigned int head, tail; size_t left; int n; @@ -711,12 +707,11 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ @@ -752,7 +747,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (ret >= buf->len) { ret -= buf->len; buf->len = 0; @@ -809,7 +804,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, pipe_lock(pipe); while (len > 0) { - unsigned int head, tail, mask, bc = 0; + unsigned int head, tail, bc = 0; size_t remain = len; /* @@ -846,10 +841,9 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + 
struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg; if (!buf->len) { @@ -894,7 +888,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, len -= ret; tail = pipe->tail; while (ret > 0) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; @@ -1725,7 +1719,6 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; int ret = 0; bool input_wakeup = false; @@ -1747,9 +1740,7 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { size_t o_len; @@ -1792,8 +1783,8 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, goto retry; } - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); if (len >= ibuf->len) { /* @@ -1862,7 +1853,6 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; ssize_t ret = 0; /* @@ -1873,9 +1863,7 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { if (!opipe->readers) { @@ -1896,8 +1884,8 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_full(o_head, o_tail, opipe->max_usage)) break; - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); /* * Get a reference to this pipe buffer,