From 0486b1832dc386c1adb6abef0afbed091d157cd9 Mon Sep 17 00:00:00 2001 From: Jiale Yang <295107659@qq.com> Date: Wed, 20 Nov 2024 16:30:28 +0800 Subject: [PATCH 01/25] fuse: change 'unsigned' to 'unsigned int' Prefer 'unsigned int' to bare 'unsigned', as reported by checkpatch.pl: WARNING: Prefer 'unsigned int' to bare use of 'unsigned'. Signed-off-by: Jiale Yang <295107659@qq.com> Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 4 ++-- fs/fuse/inode.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d56d4fd956db..f359efc3c2cf 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -74,8 +74,8 @@ extern struct list_head fuse_conn_list; extern struct mutex fuse_mutex; /** Module parameters */ -extern unsigned max_user_bgreq; -extern unsigned max_user_congthresh; +extern unsigned int max_user_bgreq; +extern unsigned int max_user_congthresh; /* One forget request */ struct fuse_forget_link { diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd48e8d37f2e..9471cc57637f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -41,7 +41,7 @@ unsigned int fuse_max_pages_limit = 256; unsigned int fuse_default_req_timeout; unsigned int fuse_max_req_timeout; -unsigned max_user_bgreq; +unsigned int max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); __MODULE_PARM_TYPE(max_user_bgreq, "uint"); @@ -49,7 +49,7 @@ MODULE_PARM_DESC(max_user_bgreq, "Global limit for the maximum number of backgrounded requests an " "unprivileged user can set"); -unsigned max_user_congthresh; +unsigned int max_user_congthresh; module_param_call(max_user_congthresh, set_global_limit, param_get_uint, &max_user_congthresh, 0644); __MODULE_PARM_TYPE(max_user_congthresh, "uint"); @@ -1036,7 +1036,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); -static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned int mode) { struct fuse_attr attr; memset(&attr, 0, sizeof(attr)); @@ -1211,7 +1211,7 @@ static const struct super_operations fuse_super_operations = { .show_options = fuse_show_options, }; -static void sanitize_global_limit(unsigned *limit) +static void sanitize_global_limit(unsigned int *limit) { /* * The default maximum number of async requests is calculated to consume @@ -1232,7 +1232,7 @@ static int set_global_limit(const char *val, const struct kernel_param *kp) if (rv) return rv; - sanitize_global_limit((unsigned *)kp->arg); + sanitize_global_limit((unsigned int *)kp->arg); return 0; } From faa794dd2e17e74cc0c8fdb6742dfb6ca3c182d0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 29 Jan 2025 10:17:56 -0800 Subject: [PATCH 02/25] fuse: Move prefaulting out of hot write path Prefaulting the write source buffer incurs an extra userspace access in the common fast path. Make fuse_fill_write_pages() consistent with generic_perform_write(): only touch userspace an extra time when copy_folio_from_iter_atomic() has failed to make progress. Signed-off-by: Dave Hansen Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 754378dd9f71..70dacbe5eae7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1243,10 +1243,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, bytes = min_t(size_t, bytes, fc->max_write - count); again: - err = -EFAULT; - if (fault_in_iov_iter_readable(ii, bytes)) - break; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { @@ -1263,6 +1259,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (!tmp) { folio_unlock(folio); folio_put(folio); + + /* + * Ensure forward progress by faulting in + * while not holding the folio lock: + */ + if (fault_in_iov_iter_readable(ii, bytes)) { + err = -EFAULT; + break; + } + goto again; } From 2396356a945bb022aff02656f59c2a45d457043f Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 26 Feb 2025 09:14:51 +0000 Subject: [PATCH 03/25] fuse: add more control over cache invalidation behaviour Currently userspace is able to notify the kernel to invalidate the cache for an inode. This means that, if all the inodes in a filesystem need to be invalidated, then userspace needs to iterate through all of them and do this kernel notification separately. This patch adds the concept of 'epoch': each fuse connection will have the current epoch initialized and every new dentry will have it's d_time set to the current epoch value. A new operation will then allow userspace to increment the epoch value. Every time a dentry is d_revalidate()'ed, it's epoch is compared with the current connection epoch and invalidated if it's value is different. Signed-off-by: Luis Henriques Tested-by: Laura Promberger Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 16 ++++++++++++++++ fs/fuse/dir.c | 27 ++++++++++++++++++++++----- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 1 + fs/fuse/readdir.c | 3 +++ include/uapi/linux/fuse.h | 6 +++++- 6 files changed, 50 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6dcbaa218b7a..00888befb5c8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2021,6 +2021,19 @@ static int fuse_notify_resend(struct fuse_conn *fc) return 0; } +/* + * Increments the fuse connection epoch. This will result of dentries from + * previous epochs to be invalidated. + * + * XXX optimization: add call to shrink_dcache_sb()? + */ +static int fuse_notify_inc_epoch(struct fuse_conn *fc) +{ + atomic_inc(&fc->epoch); + + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -2049,6 +2062,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_RESEND: return fuse_notify_resend(fc); + case FUSE_NOTIFY_INC_EPOCH: + return fuse_notify_inc_epoch(fc); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 83ac192e7fdd..1fb0b15a6088 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -200,9 +200,14 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, { struct inode *inode; struct fuse_mount *fm; + struct fuse_conn *fc; struct fuse_inode *fi; int ret; + fc = get_fuse_conn_super(dir->i_sb); + if (entry->d_time < atomic_read(&fc->epoch)) + goto invalid; + inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; @@ -415,16 +420,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { - int err; struct fuse_entry_out outarg; + struct fuse_conn *fc; struct inode *inode; struct dentry *newent; + int err, epoch; bool outarg_valid = true; bool locked; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + fc = get_fuse_conn_super(dir->i_sb); + epoch = atomic_read(&fc->epoch); + locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); @@ -446,6 +455,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, goto out_err; entry = newent ? newent : entry; + entry->d_time = epoch; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else @@ -619,7 +629,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, umode_t mode, u32 opcode) { - int err; struct inode *inode; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); @@ -629,11 +638,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + int epoch, err; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); + epoch = atomic_read(&fm->fc->epoch); forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) @@ -702,6 +713,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, } kfree(forget); d_instantiate(entry, inode); + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); err = generic_file_open(inode, file); @@ -788,12 +800,14 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun struct fuse_entry_out outarg; struct inode *inode; struct dentry *d; - int err; struct fuse_forget_link *forget; + int epoch, err; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + epoch = atomic_read(&fm->fc->epoch); + forget = fuse_alloc_forget(); if (!forget) return ERR_PTR(-ENOMEM); @@ -835,10 +849,13 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun if (IS_ERR(d)) return d; - if (d) + if (d) { + d->d_time = epoch; fuse_change_entry_timeout(d, &outarg); - else + } else { + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outarg); + } fuse_dir_changed(dir); return d; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f359efc3c2cf..74340e4eebe5 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -636,6 +636,9 @@ struct fuse_conn { /** Number of fuse_dev's */ atomic_t dev_count; + /** Current epoch for up-to-date dentries */ + atomic_t epoch; + struct rcu_head rcu; /** The user id for this mount */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9471cc57637f..bfe8d8af46f3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -962,6 +962,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); + atomic_set(&fc->epoch, 1); init_waitqueue_head(&fc->blocked_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 17ce9636a2b1..46b7146f2c0d 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file, struct fuse_conn *fc; struct inode *inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + int epoch; if (!o->nodeid) { /* @@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file, return -EIO; fc = get_fuse_conn(dir); + epoch = atomic_read(&fc->epoch); name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); @@ -256,6 +258,7 @@ static int fuse_direntplus_link(struct file *file, } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + dentry->d_time = epoch; fuse_change_entry_timeout(dentry, o); dput(dentry); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 5ec43ecbceb7..122d6586e8d4 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -232,6 +232,9 @@ * * 7.43 * - add FUSE_REQUEST_TIMEOUT + * + * 7.44 + * - add FUSE_NOTIFY_INC_EPOCH */ #ifndef _LINUX_FUSE_H @@ -267,7 +270,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 43 +#define FUSE_KERNEL_MINOR_VERSION 44 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -671,6 +674,7 @@ enum fuse_notify_code { FUSE_NOTIFY_RETRIEVE = 5, FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, + FUSE_NOTIFY_INC_EPOCH = 8, FUSE_NOTIFY_CODE_MAX, }; From a5c4983bb90759c95d81f0d7b02000578077b1e6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 20 Feb 2025 12:16:58 -0800 Subject: [PATCH 04/25] fuse: Convert 'write' to a bit-field in struct fuse_copy_state Use a bitfield for 'write' in struct fuse_copy_state. No functional changes. Signed-off-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_dev_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index b3c2e32254ba..5e3f12664c1c 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -20,7 +20,6 @@ struct fuse_iqueue; struct fuse_forget_link; struct fuse_copy_state { - int write; struct fuse_req *req; struct iov_iter *iter; struct pipe_buffer *pipebufs; @@ -30,6 +29,7 @@ struct fuse_copy_state { struct page *pg; unsigned int len; unsigned int offset; + unsigned int write:1; unsigned int move_pages:1; unsigned int is_uring:1; struct { From 03a3617f92c2a7220af20d5b6b44420049dbe6e3 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 20 Feb 2025 12:16:59 -0800 Subject: [PATCH 05/25] fuse: use boolean bit-fields in struct fuse_copy_state Refactor struct fuse_copy_state to use boolean bit-fields to improve clarity/readability and be consistent with other fuse structs that use bit-fields for boolean state (eg fuse_fs_context, fuse_args). No functional changes. Signed-off-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 16 ++++++++-------- fs/fuse/dev_uring.c | 8 ++++---- fs/fuse/fuse_dev_i.h | 8 ++++---- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 00888befb5c8..c0b55bacecb1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -816,7 +816,7 @@ static int unlock_request(struct fuse_req *req) return err; } -void fuse_copy_init(struct fuse_copy_state *cs, int write, +void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); @@ -1538,7 +1538,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) if (!user_backed_iter(to)) return -EINVAL; - fuse_copy_init(&cs, 1, to); + fuse_copy_init(&cs, true, to); return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } @@ -1561,7 +1561,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!bufs) return -ENOMEM; - fuse_copy_init(&cs, 1, NULL); + fuse_copy_init(&cs, true, NULL); cs.pipebufs = bufs; cs.pipe = pipe; ret = fuse_dev_do_read(fud, in, &cs, len); @@ -2038,7 +2038,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { /* Don't try to move pages (yet) */ - cs->move_pages = 0; + cs->move_pages = false; switch (code) { case FUSE_NOTIFY_POLL: @@ -2189,7 +2189,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_unlock(&fpq->lock); cs->req = req; if (!req->args->page_replace) - cs->move_pages = 0; + cs->move_pages = false; if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; @@ -2227,7 +2227,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) if (!user_backed_iter(from)) return -EINVAL; - fuse_copy_init(&cs, 0, from); + fuse_copy_init(&cs, false, from); return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } @@ -2301,13 +2301,13 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - fuse_copy_init(&cs, 0, NULL); + fuse_copy_init(&cs, false, NULL); cs.pipebufs = bufs; cs.nr_segs = nbuf; cs.pipe = pipe; if (flags & SPLICE_F_MOVE) - cs.move_pages = 1; + cs.move_pages = true; ret = fuse_dev_do_write(fud, &cs, len); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index accdce2977c5..cece9698fd68 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -577,8 +577,8 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, if (err) return err; - fuse_copy_init(&cs, 0, &iter); - cs.is_uring = 1; + fuse_copy_init(&cs, false, &iter); + cs.is_uring = true; cs.req = req; return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); @@ -607,8 +607,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, return err; } - fuse_copy_init(&cs, 1, &iter); - cs.is_uring = 1; + fuse_copy_init(&cs, true, &iter); + cs.is_uring = true; cs.req = req; if (num_args > 0) { diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 5e3f12664c1c..d2e16f73ac99 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -29,9 +29,9 @@ struct fuse_copy_state { struct page *pg; unsigned int len; unsigned int offset; - unsigned int write:1; - unsigned int move_pages:1; - unsigned int is_uring:1; + bool write:1; + bool move_pages:1; + bool is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ } ring; @@ -51,7 +51,7 @@ struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); void fuse_dev_end_requests(struct list_head *head); -void fuse_copy_init(struct fuse_copy_state *cs, int write, +void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter); int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, unsigned int argpages, struct fuse_arg *args, From 4fea593e625cd50d4d11be227007849b12f17bfb Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 3 Feb 2025 11:30:22 -0800 Subject: [PATCH 06/25] fuse: optimize over-io-uring request expiration check Currently, when checking whether a request has timed out, we check fpq processing, but fuse-over-io-uring has one fpq per core and 256 entries in the processing table. For systems where there are a large number of cores, this may be too much overhead. Instead of checking the fpq processing list, check ent_w_req_queue and ent_in_userspace. Signed-off-by: Joanne Koong Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- fs/fuse/dev_uring.c | 26 +++++++++++++++++++++----- fs/fuse/fuse_dev_i.h | 1 - 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c0b55bacecb1..155bb6aeaef5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -45,7 +45,7 @@ bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list) return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout); } -bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) +static bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) { int i; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index cece9698fd68..249b210becb1 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -140,6 +140,21 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) } } +static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) +{ + struct fuse_ring_ent *ent; + struct fuse_req *req; + + ent = list_first_entry_or_null(list, struct fuse_ring_ent, list); + if (!ent) + return false; + + req = ent->fuse_req; + + return time_is_before_jiffies(req->create_time + + fc->timeout.req_timeout); +} + bool fuse_uring_request_expired(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -157,7 +172,8 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) spin_lock(&queue->lock); if (fuse_request_expired(fc, &queue->fuse_req_queue) || fuse_request_expired(fc, &queue->fuse_req_bg_queue) || - fuse_fpq_processing_expired(fc, queue->fpq.processing)) { + ent_list_request_expired(fc, &queue->ent_w_req_queue) || + ent_list_request_expired(fc, &queue->ent_in_userspace)) { spin_unlock(&queue->lock); return true; } @@ -494,7 +510,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, spin_lock(&queue->lock); if (ent->state == FRRS_AVAILABLE) { ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); need_cmd_done = true; ent->cmd = NULL; } @@ -714,7 +730,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, cmd = ent->cmd; ent->cmd = NULL; ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); spin_unlock(&queue->lock); io_uring_cmd_done(cmd, 0, 0, issue_flags); @@ -764,7 +780,7 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, clear_bit(FR_PENDING, &req->flags); ent->fuse_req = req; ent->state = FRRS_FUSE_REQ; - list_move(&ent->list, &queue->ent_w_req_queue); + list_move_tail(&ent->list, &queue->ent_w_req_queue); fuse_uring_add_to_pq(ent, req); } @@ -1180,7 +1196,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, spin_lock(&queue->lock); ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); ent->cmd = NULL; spin_unlock(&queue->lock); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index d2e16f73ac99..db136e045925 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -64,7 +64,6 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock); bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list); -bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing); #endif From 0c4f8ed498cea1e2beebf7aa3d198fa381264f88 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 14 Apr 2025 15:22:09 -0700 Subject: [PATCH 07/25] mm: skip folio reclaim in legacy memcg contexts for deadlockable mappings Currently in shrink_folio_list(), reclaim for folios under writeback falls into 3 different cases: 1) Reclaim is encountering an excessive number of folios under writeback and this folio has both the writeback and reclaim flags set 2) Dirty throttling is enabled (this happens if reclaim through cgroup is not enabled, if reclaim through cgroupv2 memcg is enabled, or if reclaim is on the root cgroup), or if the folio is not marked for immediate reclaim, or if the caller does not have __GFP_FS (or __GFP_IO if it's going to swap) set 3) Legacy cgroupv1 encounters a folio that already has the reclaim flag set and the caller did not have __GFP_FS (or __GFP_IO if swap) set In cases 1) and 2), we activate the folio and skip reclaiming it while in case 3), we wait for writeback to finish on the folio and then try to reclaim the folio again. In case 3, we wait on writeback because cgroupv1 does not have dirty folio throttling, as such this is a mitigation against the case where there are too many folios in writeback with nothing else to reclaim. If a filesystem (eg fuse) may deadlock due to reclaim waiting on writeback, then the filesystem needs to add inefficient messy workarounds to prevent this. To improve the performance of these filesystems, this commit adds two things: a) a AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM mapping flag that filesystems may set to indicate that reclaim should not wait on writeback b) if legacy memcg encounters a folio with this AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM flag set (eg case 3), the folio will be activated and skip reclaim (eg default to behavior in case 2) instead. Signed-off-by: Joanne Koong Reviewed-by: Shakeel Butt Reviewed-by: Jeff Layton Acked-by: David Hildenbrand Reviewed-by: Jingbo Xu Signed-off-by: Miklos Szeredi --- include/linux/pagemap.h | 11 +++++++++++ mm/vmscan.c | 12 +++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 26baa78f1ca7..e19ff3183bbf 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -210,6 +210,7 @@ enum mapping_flags { AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ + AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -335,6 +336,16 @@ static inline bool mapping_inaccessible(struct address_space *mapping) return test_bit(AS_INACCESSIBLE, &mapping->flags); } +static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_space *mapping) +{ + set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); +} + +static inline bool mapping_writeback_may_deadlock_on_reclaim(struct address_space *mapping) +{ + return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; diff --git a/mm/vmscan.c b/mm/vmscan.c index b620d74b0f66..8fc180fbd944 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1187,8 +1187,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the folio for immediate - * reclaim and continue scanning. + * not to fs), or the folio belongs to a mapping where + * waiting on writeback during reclaim may lead to a deadlock. + * In this case mark the folio for immediate reclaim and + * continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might @@ -1213,6 +1215,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, * takes to write them to disk. */ if (folio_test_writeback(folio)) { + mapping = folio_mapping(folio); + /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && @@ -1223,7 +1227,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || - !may_enter_fs(folio, sc->gfp_mask)) { + !may_enter_fs(folio, sc->gfp_mask) || + (mapping && + mapping_writeback_may_deadlock_on_reclaim(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have From 0c58a97f919c24fe4245015f4375a39ff05665b6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 14 Apr 2025 15:22:10 -0700 Subject: [PATCH 08/25] fuse: remove tmp folio for writebacks and internal rb tree In the current FUSE writeback design (see commit 3be5a52b30aa ("fuse: support writable mmap")), a temp page is allocated for every dirty page to be written back, the contents of the dirty page are copied over to the temp page, and the temp page gets handed to the server to write back. This is done so that writeback may be immediately cleared on the dirty page, and this in turn is done in order to mitigate the following deadlock scenario that may arise if reclaim waits on writeback on the dirty page to complete: * single-threaded FUSE server is in the middle of handling a request that needs a memory allocation * memory allocation triggers direct reclaim * direct reclaim waits on a folio under writeback * the FUSE server can't write back the folio since it's stuck in direct reclaim With a recent change that added AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM and mitigates the situation described above, FUSE writeback does not need to use temp pages if it sets AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM on its inode mappings. This commit sets AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM on the inode mappings and removes the temporary pages + extra copying and the internal rb tree. fio benchmarks -- (using averages observed from 10 runs, throwing away outliers) Setup: sudo mount -t tmpfs -o size=30G tmpfs ~/tmp_mount ./libfuse/build/example/passthrough_ll -o writeback -o max_threads=4 -o source=~/tmp_mount ~/fuse_mount fio --name=writeback --ioengine=sync --rw=write --bs={1k,4k,1M} --size=2G --numjobs=2 --ramp_time=30 --group_reporting=1 --directory=/root/fuse_mount bs = 1k 4k 1M Before 351 MiB/s 1818 MiB/s 1851 MiB/s After 341 MiB/s 2246 MiB/s 2685 MiB/s % diff -3% 23% 45% Signed-off-by: Joanne Koong Reviewed-by: Jingbo Xu Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 364 ++++------------------------------------------- fs/fuse/fuse_i.h | 3 - 2 files changed, 28 insertions(+), 339 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 70dacbe5eae7..e203dd4fcc0f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -415,89 +415,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct rb_node writepages_entry; struct list_head queue_entry; - struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, - pgoff_t idx_from, pgoff_t idx_to) -{ - struct rb_node *n; - - n = fi->writepages.rb_node; - - while (n) { - struct fuse_writepage_args *wpa; - pgoff_t curr_index; - - wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); - WARN_ON(get_fuse_inode(wpa->inode) != fi); - curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_folios) - n = n->rb_right; - else if (idx_to < curr_index) - n = n->rb_left; - else - return wpa; - } - return NULL; -} - -/* - * Check if any page in a range is under writeback - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - bool found; - - if (RB_EMPTY_ROOT(&fi->writepages)) - return false; - - spin_lock(&fi->lock); - found = fuse_find_writeback(fi, idx_from, idx_to); - spin_unlock(&fi->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ -static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); -} - -static inline bool fuse_folio_is_writeback(struct inode *inode, - struct folio *folio) -{ - pgoff_t last = folio_next_index(folio) - 1; - return fuse_range_is_writeback(inode, folio_index(folio), last); -} - -static void fuse_wait_on_folio_writeback(struct inode *inode, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); -} - /* * Wait for all pending writepages on the inode to finish. * @@ -532,10 +454,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - inode_lock(inode); - fuse_sync_writes(inode); - inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); if (err) return err; @@ -886,13 +804,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) ssize_t res; u64 attr_ver; - /* - * With the temporary pages that are used to complete writeback, we can - * have writeback that extends beyond the lifetime of the folio. So - * make sure we read a properly synced folio. - */ - fuse_wait_on_folio_writeback(inode, folio); - attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -1005,17 +916,12 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); unsigned int max_pages, nr_pages; - pgoff_t first = readahead_index(rac); - pgoff_t last = first + readahead_count(rac) - 1; if (fuse_is_bad(inode)) return; - wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); - max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); @@ -1181,7 +1087,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, int err; for (i = 0; i < ap->num_folios; i++) - fuse_wait_on_folio_writeback(inode, ap->folios[i]); + folio_wait_writeback(ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1644,7 +1550,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1841,38 +1747,34 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; - int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_folios; i++) - folio_put(ap->folios[i]); - fuse_file_put(wpa->ia.ff, false); kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) -{ - struct backing_dev_info *bdi = inode_to_bdi(inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); -} - static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) - fuse_writepage_finish_stat(inode, ap->folios[i]); + for (i = 0; i < ap->num_folios; i++) { + /* + * Benchmarks showed that ending writeback within the + * scope of the fi->lock alleviates xarray lock + * contention and noticeably improves performance. + */ + folio_end_writeback(ap->folios[i]); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + wb_writeout_inc(&bdi->wb); + } wake_up(&fi->page_waitq); } @@ -1883,7 +1785,6 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { - struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; @@ -1920,19 +1821,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(wpa); spin_unlock(&fi->lock); - - /* After rb_erase() aux request list is private */ - for (aux = wpa->next; aux; aux = next) { - next = aux->next; - aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, - aux->ia.ap.folios[0]); - fuse_writepage_free(aux); - } - fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1960,43 +1850,6 @@ __acquires(fi->lock) } } -static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, - struct fuse_writepage_args *wpa) -{ - pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - WARN_ON(!wpa->ia.ap.num_folios); - while (*p) { - struct fuse_writepage_args *curr; - pgoff_t curr_index; - - parent = *p; - curr = rb_entry(parent, struct fuse_writepage_args, - writepages_entry); - WARN_ON(curr->inode != wpa->inode); - curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - - if (idx_from >= curr_index + curr->ia.ap.num_folios) - p = &(*p)->rb_right; - else if (idx_to < curr_index) - p = &(*p)->rb_left; - else - return curr; - } - - rb_link_node(&wpa->writepages_entry, parent, p); - rb_insert_color(&wpa->writepages_entry, root); - return NULL; -} - -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) -{ - WARN_ON(fuse_insert_writeback(root, wpa)); -} - static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -2016,41 +1869,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); - rb_erase(&wpa->writepages_entry, &fi->writepages); - while (wpa->next) { - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_writepage_args *next = wpa->next; - - wpa->next = next->next; - next->next = NULL; - tree_insert(&fi->writepages, next); - - /* - * Skip fuse_flush_writepages() to make it easy to crop requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. - */ - fuse_send_writepage(fm, next, inarg->offset + inarg->size); - } fi->writectr--; fuse_writepage_finish(wpa); spin_unlock(&fi->lock); @@ -2137,19 +1955,16 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t folio_index) + uint32_t folio_index) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; - folio_copy(tmp_folio, folio); - - ap->folios[folio_index] = tmp_folio; + ap->folios[folio_index] = folio; ap->descs[folio_index].offset = 0; ap->descs[folio_index].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2184,18 +1999,12 @@ static int fuse_writepage_locked(struct folio *folio) struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct folio *tmp_folio; struct fuse_file *ff; - int error = -ENOMEM; + int error = -EIO; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto err; - - error = -EIO; ff = fuse_write_file_get(fi); if (!ff) - goto err_nofile; + goto err; wpa = fuse_writepage_args_setup(folio, ff); error = -ENOMEM; @@ -2206,22 +2015,17 @@ static int fuse_writepage_locked(struct folio *folio) ap->num_folios = 1; folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); + fuse_writepage_args_page_fill(wpa, folio, 0); spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - folio_end_writeback(folio); - return 0; err_writepage_args: fuse_file_put(ff, false); -err_nofile: - folio_put(tmp_folio); err: mapping_set_error(folio->mapping, error); return error; @@ -2231,7 +2035,6 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct folio **orig_folios; unsigned int max_folios; }; @@ -2266,69 +2069,11 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_folios = wpa->ia.ap.num_folios; - int i; spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - - for (i = 0; i < num_folios; i++) - folio_end_writeback(data->orig_folios[i]); -} - -/* - * Check under fi->lock if the page is under writeback, and insert it onto the - * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's - * one already added for a page at this offset. If there's none, then insert - * this new request onto the auxiliary list, otherwise reuse the existing one by - * swapping the new temp page with the old one. - */ -static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); - struct fuse_writepage_args *tmp; - struct fuse_writepage_args *old_wpa; - struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - - WARN_ON(new_ap->num_folios != 0); - new_ap->num_folios = 1; - - spin_lock(&fi->lock); - old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); - if (!old_wpa) { - spin_unlock(&fi->lock); - return true; - } - - for (tmp = old_wpa->next; tmp; tmp = tmp->next) { - pgoff_t curr_index; - - WARN_ON(tmp->inode != new_wpa->inode); - curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == folio->index) { - WARN_ON(tmp->ia.ap.num_folios != 1); - swap(tmp->ia.ap.folios[0], new_ap->folios[0]); - break; - } - } - - if (!tmp) { - new_wpa->next = old_wpa->next; - old_wpa->next = new_wpa; - } - - spin_unlock(&fi->lock); - - if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, - folio); - fuse_writepage_free(new_wpa); - } - - return false; } static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, @@ -2337,15 +2082,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, { WARN_ON(!ap->num_folios); - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - if (fuse_folio_is_writeback(data->inode, folio)) - return true; - /* Reached max pages */ if (ap->num_folios == fc->max_pages) return true; @@ -2355,7 +2091,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, return true; /* Discontinuity */ - if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) + if (ap->folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) return true; /* Need to grow the pages array? If so, did the expansion fail? */ @@ -2374,7 +2110,6 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct folio *tmp_folio; int err; if (!data->ff) { @@ -2389,54 +2124,23 @@ static int fuse_writepages_fill(struct folio *folio, data->wpa = NULL; } - err = -ENOMEM; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto out_unlock; - - /* - * The page must not be redirtied until the writeout is completed - * (i.e. userspace has sent a reply to the write request). Otherwise - * there could be more than one temporary page instance for each real - * page. - * - * This is ensured by holding the page lock in page_mkwrite() while - * checking fuse_page_is_writeback(). We already hold the page lock - * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_folios. - * After this fuse_page_is_writeback() will indicate that the page is - * under writeback, so we can release the page lock. - */ if (data->wpa == NULL) { err = -ENOMEM; wpa = fuse_writepage_args_setup(folio, data->ff); - if (!wpa) { - folio_put(tmp_folio); + if (!wpa) goto out_unlock; - } fuse_file_get(wpa->ia.ff); data->max_folios = 1; ap = &wpa->ia.ap; } folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); - data->orig_folios[ap->num_folios] = folio; + fuse_writepage_args_page_fill(wpa, folio, ap->num_folios); err = 0; - if (data->wpa) { - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_folios++; - spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, folio)) { + ap->num_folios++; + if (!data->wpa) data->wpa = wpa; - } else { - folio_end_writeback(folio); - } out_unlock: folio_unlock(folio); @@ -2463,13 +2167,6 @@ static int fuse_writepages(struct address_space *mapping, data.wpa = NULL; data.ff = NULL; - err = -ENOMEM; - data.orig_folios = kcalloc(fc->max_pages, - sizeof(struct folio *), - GFP_NOFS); - if (!data.orig_folios) - goto out; - err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { WARN_ON(!data.wpa->ia.ap.num_folios); @@ -2478,7 +2175,6 @@ static int fuse_writepages(struct address_space *mapping, if (data.ff) fuse_file_put(data.ff, false); - kfree(data.orig_folios); out: return err; } @@ -2503,8 +2199,6 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, folio->index); - if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* @@ -2567,13 +2261,9 @@ static int fuse_launder_folio(struct folio *folio) { int err = 0; if (folio_clear_dirty_for_io(folio)) { - struct inode *inode = folio->mapping->host; - - /* Serialize with pending writeback for the same page */ - fuse_wait_on_page_writeback(inode, folio->index); err = fuse_writepage_locked(folio); if (!err) - fuse_wait_on_page_writeback(inode, folio->index); + folio_wait_writeback(folio); } return err; } @@ -2617,7 +2307,7 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_NOPAGE; } - fuse_wait_on_folio_writeback(inode, folio); + folio_wait_writeback(folio); return VM_FAULT_LOCKED; } @@ -3435,9 +3125,12 @@ static const struct address_space_operations fuse_file_aops = { void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + if (fc->writeback_cache) + mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); @@ -3445,7 +3138,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); - fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 74340e4eebe5..b54f4f57789f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -161,9 +161,6 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; - - /* List of writepage requestst (pending or sent) */ - struct rb_root writepages; }; /* readdir cache (directory only) */ From 69efbff69f89c9b2b72c4d82ad8b59706add768a Mon Sep 17 00:00:00 2001 From: Guang Yuan Wu Date: Fri, 2 May 2025 04:04:21 +0000 Subject: [PATCH 09/25] fuse: fix race between concurrent setattrs from multiple nodes When mounting a user-space filesystem on multiple clients, after concurrent ->setattr() calls from different node, stale inode attributes may be cached in some node. This is caused by fuse_setattr() racing with fuse_reverse_inval_inode(). When filesystem server receives setattr request, the client node with valid iattr cached will be required to update the fuse_inode's attr_version and invalidate the cache by fuse_reverse_inval_inode(), and at the next call to ->getattr() they will be fetched from user space. The race scenario is: 1. client-1 sends setattr (iattr-1) request to server 2. client-1 receives the reply from server 3. before client-1 updates iattr-1 to the cached attributes by fuse_change_attributes_common(), server receives another setattr (iattr-2) request from client-2 4. server requests client-1 to update the inode attr_version and invalidate the cached iattr, and iattr-1 becomes staled 5. client-2 receives the reply from server, and caches iattr-2 6. continue with step 2, client-1 invokes fuse_change_attributes_common(), and caches iattr-1 The issue has been observed from concurrent of chmod, chown, or truncate, which all invoke ->setattr() call. The solution is to use fuse_inode's attr_version to check whether the attributes have been modified during the setattr request's lifetime. If so, mark the attributes as invalid in the function fuse_change_attributes_common(). Signed-off-by: Guang Yuan Wu Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1fb0b15a6088..b670f1da31e6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1963,6 +1963,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int err; bool trust_local_cmtime = is_wb; bool fault_blocked = false; + u64 attr_version; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -2047,6 +2048,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) inarg.valid |= FATTR_KILL_SUIDGID; } + + attr_version = fuse_get_attr_version(fm->fc); fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); if (err) { @@ -2072,6 +2075,14 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, /* FIXME: clear I_DIRTY_SYNC? */ } + if (fi->attr_version > attr_version) { + /* + * Apply attributes, for example for fsnotify_change(), but set + * attribute timeout to zero. + */ + outarg.attr_valid = outarg.attr_valid_nsec = 0; + } + fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode), 0); From 767c4b82715ad34b5b48159eba4e69608dd076e3 Mon Sep 17 00:00:00 2001 From: Chen Linxuan Date: Wed, 7 May 2025 16:42:16 +0800 Subject: [PATCH 10/25] MAINTAINERS: update filter of FUSE documentation There are some fuse-*.rst files in Documentation directory, let's make get_mantainers.pl work on those file instead of only fuse.rst. Reviewed-by: Amir Goldstein Signed-off-by: Chen Linxuan Signed-off-by: Miklos Szeredi --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 96b827049501..2174a854831f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9726,7 +9726,7 @@ L: linux-fsdevel@vger.kernel.org S: Maintained W: https://github.com/libfuse/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git -F: Documentation/filesystems/fuse.rst +F: Documentation/filesystems/fuse* F: fs/fuse/ F: include/uapi/linux/fuse.h From 18ee43c398af0b7e2eb2a4cd8469967834b0802d Mon Sep 17 00:00:00 2001 From: Chen Linxuan Date: Wed, 7 May 2025 16:42:17 +0800 Subject: [PATCH 11/25] docs: filesystems: add fuse-passthrough.rst Add a documentation about FUSE passthrough. It's mainly about why FUSE passthrough needs CAP_SYS_ADMIN. Link: https://lore.kernel.org/all/4b64a41c-6167-4c02-8bae-3021270ca519@fastmail.fm/T/#mc73e04df56b8830b1d7b06b5d9f22e594fba423e Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxhAY1m7ubJ3p-A3rSufw_53WuDRMT1Zqe_OC0bP_Fb3Zw@mail.gmail.com/ Reviewed-by: Amir Goldstein Signed-off-by: Chen Linxuan Signed-off-by: Miklos Szeredi --- .../filesystems/fuse-passthrough.rst | 133 ++++++++++++++++++ Documentation/filesystems/index.rst | 1 + 2 files changed, 134 insertions(+) create mode 100644 Documentation/filesystems/fuse-passthrough.rst diff --git a/Documentation/filesystems/fuse-passthrough.rst b/Documentation/filesystems/fuse-passthrough.rst new file mode 100644 index 000000000000..2b0e7c2da54a --- /dev/null +++ b/Documentation/filesystems/fuse-passthrough.rst @@ -0,0 +1,133 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================ +FUSE Passthrough +================ + +Introduction +============ + +FUSE (Filesystem in Userspace) passthrough is a feature designed to improve the +performance of FUSE filesystems for I/O operations. Typically, FUSE operations +involve communication between the kernel and a userspace FUSE daemon, which can +incur overhead. Passthrough allows certain operations on a FUSE file to bypass +the userspace daemon and be executed directly by the kernel on an underlying +"backing file". + +This is achieved by the FUSE daemon registering a file descriptor (pointing to +the backing file on a lower filesystem) with the FUSE kernel module. The kernel +then receives an identifier (``backing_id``) for this registered backing file. +When a FUSE file is subsequently opened, the FUSE daemon can, in its response to +the ``OPEN`` request, include this ``backing_id`` and set the +``FOPEN_PASSTHROUGH`` flag. This establishes a direct link for specific +operations. + +Currently, passthrough is supported for operations like ``read(2)``/``write(2)`` +(via ``read_iter``/``write_iter``), ``splice(2)``, and ``mmap(2)``. + +Enabling Passthrough +==================== + +To use FUSE passthrough: + + 1. The FUSE filesystem must be compiled with ``CONFIG_FUSE_PASSTHROUGH`` + enabled. + 2. The FUSE daemon, during the ``FUSE_INIT`` handshake, must negotiate the + ``FUSE_PASSTHROUGH`` capability and specify its desired + ``max_stack_depth``. + 3. The (privileged) FUSE daemon uses the ``FUSE_DEV_IOC_BACKING_OPEN`` ioctl + on its connection file descriptor (e.g., ``/dev/fuse``) to register a + backing file descriptor and obtain a ``backing_id``. + 4. When handling an ``OPEN`` or ``CREATE`` request for a FUSE file, the daemon + replies with the ``FOPEN_PASSTHROUGH`` flag set in + ``fuse_open_out::open_flags`` and provides the corresponding ``backing_id`` + in ``fuse_open_out::backing_id``. + 5. The FUSE daemon should eventually call ``FUSE_DEV_IOC_BACKING_CLOSE`` with + the ``backing_id`` to release the kernel's reference to the backing file + when it's no longer needed for passthrough setups. + +Privilege Requirements +====================== + +Setting up passthrough functionality currently requires the FUSE daemon to +possess the ``CAP_SYS_ADMIN`` capability. This requirement stems from several +security and resource management considerations that are actively being +discussed and worked on. The primary reasons for this restriction are detailed +below. + +Resource Accounting and Visibility +---------------------------------- + +The core mechanism for passthrough involves the FUSE daemon opening a file +descriptor to a backing file and registering it with the FUSE kernel module via +the ``FUSE_DEV_IOC_BACKING_OPEN`` ioctl. This ioctl returns a ``backing_id`` +associated with a kernel-internal ``struct fuse_backing`` object, which holds a +reference to the backing ``struct file``. + +A significant concern arises because the FUSE daemon can close its own file +descriptor to the backing file after registration. The kernel, however, will +still hold a reference to the ``struct file`` via the ``struct fuse_backing`` +object as long as it's associated with a ``backing_id`` (or subsequently, with +an open FUSE file in passthrough mode). + +This behavior leads to two main issues for unprivileged FUSE daemons: + + 1. **Invisibility to lsof and other inspection tools**: Once the FUSE + daemon closes its file descriptor, the open backing file held by the kernel + becomes "hidden." Standard tools like ``lsof``, which typically inspect + process file descriptor tables, would not be able to identify that this + file is still open by the system on behalf of the FUSE filesystem. This + makes it difficult for system administrators to track resource usage or + debug issues related to open files (e.g., preventing unmounts). + + 2. **Bypassing RLIMIT_NOFILE**: The FUSE daemon process is subject to + resource limits, including the maximum number of open file descriptors + (``RLIMIT_NOFILE``). If an unprivileged daemon could register backing files + and then close its own FDs, it could potentially cause the kernel to hold + an unlimited number of open ``struct file`` references without these being + accounted against the daemon's ``RLIMIT_NOFILE``. This could lead to a + denial-of-service (DoS) by exhausting system-wide file resources. + +The ``CAP_SYS_ADMIN`` requirement acts as a safeguard against these issues, +restricting this powerful capability to trusted processes. + +**NOTE**: ``io_uring`` solves this similar issue by exposing its "fixed files", +which are visible via ``fdinfo`` and accounted under the registering user's +``RLIMIT_NOFILE``. + +Filesystem Stacking and Shutdown Loops +-------------------------------------- + +Another concern relates to the potential for creating complex and problematic +filesystem stacking scenarios if unprivileged users could set up passthrough. +A FUSE passthrough filesystem might use a backing file that resides: + + * On the *same* FUSE filesystem. + * On another filesystem (like OverlayFS) which itself might have an upper or + lower layer that is a FUSE filesystem. + +These configurations could create dependency loops, particularly during +filesystem shutdown or unmount sequences, leading to deadlocks or system +instability. This is conceptually similar to the risks associated with the +``LOOP_SET_FD`` ioctl, which also requires ``CAP_SYS_ADMIN``. + +To mitigate this, FUSE passthrough already incorporates checks based on +filesystem stacking depth (``sb->s_stack_depth`` and ``fc->max_stack_depth``). +For example, during the ``FUSE_INIT`` handshake, the FUSE daemon can negotiate +the ``max_stack_depth`` it supports. When a backing file is registered via +``FUSE_DEV_IOC_BACKING_OPEN``, the kernel checks if the backing file's +filesystem stack depth is within the allowed limit. + +The ``CAP_SYS_ADMIN`` requirement provides an additional layer of security, +ensuring that only privileged users can create these potentially complex +stacking arrangements. + +General Security Posture +------------------------ + +As a general principle for new kernel features that allow userspace to instruct +the kernel to perform direct operations on its behalf based on user-provided +file descriptors, starting with a higher privilege requirement (like +``CAP_SYS_ADMIN``) is a conservative and common security practice. This allows +the feature to be used and tested while further security implications are +evaluated and addressed. diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index a9cf8e950b15..2913f4f2e00c 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -99,6 +99,7 @@ Documentation for filesystem implementations. fuse fuse-io fuse-io-uring + fuse-passthrough inotify isofs nilfs2 From f09222980d775199de2f5d739cf453f7bf39aa4a Mon Sep 17 00:00:00 2001 From: Chen Linxuan Date: Tue, 13 May 2025 12:20:49 +0800 Subject: [PATCH 12/25] fs: fuse: add dev id to /dev/fuse fdinfo This commit add fuse connection device id to fdinfo of opened /dev/fuse files. Related discussions can be found at links below. Link: https://lore.kernel.org/all/CAJfpegvEYUgEbpATpQx8NqVR33Mv-VK96C+gbTag1CEUeBqvnA@mail.gmail.com/ Signed-off-by: Chen Linxuan Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 155bb6aeaef5..07e08362d5b9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,6 +23,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -2618,6 +2619,17 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, } } +#ifdef CONFIG_PROC_FS +static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) +{ + struct fuse_dev *fud = fuse_get_dev(file); + if (!fud) + return; + + seq_printf(seq, "fuse_connection:\t%u\n", fud->fc->dev); +} +#endif + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, @@ -2633,6 +2645,9 @@ const struct file_operations fuse_dev_operations = { #ifdef CONFIG_FUSE_IO_URING .uring_cmd = fuse_uring_cmd, #endif +#ifdef CONFIG_PROC_FS + .show_fdinfo = fuse_dev_show_fdinfo, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); From 394244b24fdd09cbbe0290494073ad95a547b44f Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:30 -0700 Subject: [PATCH 13/25] fuse: support copying large folios Currently, all folios associated with fuse are one page size. As part of the work to enable large folios, this commit adds support for copying to/from folios larger than one page size. Signed-off-by: Joanne Koong Reviewed-by: Jeff Layton Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 97 ++++++++++++++++++++++---------------------- fs/fuse/fuse_dev_i.h | 2 +- 2 files changed, 49 insertions(+), 50 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 07e08362d5b9..f8fd24df173c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -956,10 +956,10 @@ static int fuse_check_folio(struct folio *folio) * folio that was originally in @pagep will lose a reference and the new * folio returned in @pagep will carry a reference. */ -static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop) { int err; - struct folio *oldfolio = page_folio(*pagep); + struct folio *oldfolio = *foliop; struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; @@ -980,7 +980,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) cs->pipebufs++; cs->nr_segs--; - if (cs->len != PAGE_SIZE) + if (cs->len != folio_size(oldfolio)) goto out_fallback; if (!pipe_buf_try_steal(cs->pipe, buf)) @@ -1026,7 +1026,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = &newfolio->page; + *foliop = newfolio; spin_unlock(&cs->req->waitq.lock); if (err) { @@ -1059,8 +1059,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) goto out_put_old; } -static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, - unsigned offset, unsigned count) +static int fuse_ref_folio(struct fuse_copy_state *cs, struct folio *folio, + unsigned offset, unsigned count) { struct pipe_buffer *buf; int err; @@ -1068,17 +1068,17 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; - get_page(page); + folio_get(folio); err = unlock_request(cs->req); if (err) { - put_page(page); + folio_put(folio); return err; } fuse_copy_finish(cs); buf = cs->pipebufs; - buf->page = page; + buf->page = &folio->page; buf->offset = offset; buf->len = count; @@ -1090,20 +1090,24 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, } /* - * Copy a page in the request to/from the userspace buffer. Must be + * Copy a folio in the request to/from the userspace buffer. Must be * done atomically */ -static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, - unsigned offset, unsigned count, int zeroing) +static int fuse_copy_folio(struct fuse_copy_state *cs, struct folio **foliop, + unsigned offset, unsigned count, int zeroing) { int err; - struct page *page = *pagep; + struct folio *folio = *foliop; + size_t size; - if (page && zeroing && count < PAGE_SIZE) - clear_highpage(page); + if (folio) { + size = folio_size(folio); + if (zeroing && count < size) + folio_zero_range(folio, 0, size); + } while (count) { - if (cs->write && cs->pipebufs && page) { + if (cs->write && cs->pipebufs && folio) { /* * Can't control lifetime of pipe buffers, so always * copy user pages. @@ -1113,12 +1117,12 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, if (err) return err; } else { - return fuse_ref_page(cs, page, offset, count); + return fuse_ref_folio(cs, folio, offset, count); } } else if (!cs->len) { - if (cs->move_pages && page && - offset == 0 && count == PAGE_SIZE) { - err = fuse_try_move_page(cs, pagep); + if (cs->move_folios && folio && + offset == 0 && count == size) { + err = fuse_try_move_folio(cs, foliop); if (err <= 0) return err; } else { @@ -1127,22 +1131,30 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, return err; } } - if (page) { - void *mapaddr = kmap_local_page(page); - void *buf = mapaddr + offset; - offset += fuse_copy_do(cs, &buf, &count); + if (folio) { + void *mapaddr = kmap_local_folio(folio, offset); + void *buf = mapaddr; + unsigned int copy = count; + unsigned int bytes_copied; + + if (folio_test_highmem(folio) && count > PAGE_SIZE - offset_in_page(offset)) + copy = PAGE_SIZE - offset_in_page(offset); + + bytes_copied = fuse_copy_do(cs, &buf, ©); kunmap_local(mapaddr); + offset += bytes_copied; + count -= bytes_copied; } else offset += fuse_copy_do(cs, NULL, &count); } - if (page && !cs->write) - flush_dcache_page(page); + if (folio && !cs->write) + flush_dcache_folio(folio); return 0; } -/* Copy pages in the request to/from userspace buffer */ -static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, - int zeroing) +/* Copy folios in the request to/from userspace buffer */ +static int fuse_copy_folios(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) { unsigned i; struct fuse_req *req = cs->req; @@ -1152,23 +1164,12 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, int err; unsigned int offset = ap->descs[i].offset; unsigned int count = min(nbytes, ap->descs[i].length); - struct page *orig, *pagep; - orig = pagep = &ap->folios[i]->page; - - err = fuse_copy_page(cs, &pagep, offset, count, zeroing); + err = fuse_copy_folio(cs, &ap->folios[i], offset, count, zeroing); if (err) return err; nbytes -= count; - - /* - * fuse_copy_page may have moved a page from a pipe instead of - * copying into our given page, so update the folios if it was - * replaced. - */ - if (pagep != orig) - ap->folios[i] = page_folio(pagep); } return 0; } @@ -1198,7 +1199,7 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, for (i = 0; !err && i < numargs; i++) { struct fuse_arg *arg = &args[i]; if (i == numargs - 1 && argpages) - err = fuse_copy_pages(cs, arg->size, zeroing); + err = fuse_copy_folios(cs, arg->size, zeroing); else err = fuse_copy_one(cs, arg->value, arg->size); } @@ -1787,7 +1788,6 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { struct folio *folio; - struct page *page; unsigned int this_num; folio = filemap_grab_folio(mapping, index); @@ -1795,9 +1795,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, if (IS_ERR(folio)) goto out_iput; - page = &folio->page; this_num = min_t(unsigned, num, folio_size(folio) - offset); - err = fuse_copy_page(cs, &page, offset, this_num, 0); + err = fuse_copy_folio(cs, &folio, offset, this_num, 0); if (!folio_test_uptodate(folio) && !err && offset == 0 && (this_num == folio_size(folio) || file_size == end)) { folio_zero_segment(folio, this_num, folio_size(folio)); @@ -2038,8 +2037,8 @@ static int fuse_notify_inc_epoch(struct fuse_conn *fc) static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { - /* Don't try to move pages (yet) */ - cs->move_pages = false; + /* Don't try to move folios (yet) */ + cs->move_folios = false; switch (code) { case FUSE_NOTIFY_POLL: @@ -2190,7 +2189,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_unlock(&fpq->lock); cs->req = req; if (!req->args->page_replace) - cs->move_pages = false; + cs->move_folios = false; if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; @@ -2308,7 +2307,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, cs.pipe = pipe; if (flags & SPLICE_F_MOVE) - cs.move_pages = true; + cs.move_folios = true; ret = fuse_dev_do_write(fud, &cs, len); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index db136e045925..5a9bd771a319 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -30,7 +30,7 @@ struct fuse_copy_state { unsigned int len; unsigned int offset; bool write:1; - bool move_pages:1; + bool move_folios:1; bool is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ From 3568a956932621cafadafc8b75fcf6dc06555105 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:31 -0700 Subject: [PATCH 14/25] fuse: support large folios for retrieves Add support for folios larger than one page size for retrieves. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f8fd24df173c..b7d0a81c5df1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1849,7 +1849,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - unsigned int num_pages, cur_pages = 0; + unsigned int num_pages; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); @@ -1867,6 +1867,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); + num = min(num, num_pages << PAGE_SHIFT); args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0])); @@ -1887,25 +1888,29 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num && cur_pages < num_pages) { + while (num) { struct folio *folio; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) break; - this_num = min_t(unsigned, num, PAGE_SIZE - offset); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min(folio_size(folio) - folio_offset, num); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + ap->folios[ap->num_folios] = folio; - ap->descs[ap->num_folios].offset = offset; - ap->descs[ap->num_folios].length = this_num; + ap->descs[ap->num_folios].offset = folio_offset; + ap->descs[ap->num_folios].length = nr_bytes; ap->num_folios++; - cur_pages++; offset = 0; - num -= this_num; - total_len += this_num; - index++; + num -= nr_bytes; + total_len += nr_bytes; + index += nr_pages; } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; From 63c69ad3d18a8032d50c65de5a3b87136f0cfb42 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:32 -0700 Subject: [PATCH 15/25] fuse: refactor fuse_fill_write_pages() Refactor the logic in fuse_fill_write_pages() for copying out write data. This will make the future change for supporting large folios for writes easier. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e203dd4fcc0f..6b77daa2fbce 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1132,21 +1132,21 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); - unsigned int nr_pages = 0; size_t count = 0; - int err; + unsigned int num; + int err = 0; + + num = min(iov_iter_count(ii), fc->max_write); + num = min(num, max_pages << PAGE_SHIFT); ap->args.in_pages = true; ap->descs[0].offset = offset; - do { + while (num) { size_t tmp; struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; - size_t bytes = min_t(size_t, PAGE_SIZE - offset, - iov_iter_count(ii)); - - bytes = min_t(size_t, bytes, fc->max_write - count); + unsigned bytes = min(PAGE_SIZE - offset, num); again: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, @@ -1178,14 +1178,13 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, goto again; } - err = 0; ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].length = tmp; ap->num_folios++; - nr_pages++; count += tmp; pos += tmp; + num -= tmp; offset += tmp; if (offset == PAGE_SIZE) offset = 0; @@ -1200,10 +1199,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, ia->write.folio_locked = true; break; } - if (!fc->big_writes) + if (!fc->big_writes || offset != 0) break; - } while (iov_iter_count(ii) && count < fc->max_write && - nr_pages < max_pages && offset == 0); + } return count > 0 ? count : err; } From d60a6015e1a2848fc04237bd37ac65b204772313 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:33 -0700 Subject: [PATCH 16/25] fuse: support large folios for writethrough writes Add support for folios larger than one page size for writethrough writes. Signed-off-by: Joanne Koong Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6b77daa2fbce..2d9bc484e87a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1146,7 +1146,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, size_t tmp; struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; - unsigned bytes = min(PAGE_SIZE - offset, num); + unsigned int bytes; + unsigned int folio_offset; again: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, @@ -1159,7 +1160,10 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); - tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + bytes = min(folio_size(folio) - folio_offset, num); + + tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii); flush_dcache_folio(folio); if (!tmp) { @@ -1179,6 +1183,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, } ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = folio_offset; ap->descs[ap->num_folios].length = tmp; ap->num_folios++; @@ -1186,11 +1191,11 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, pos += tmp; num -= tmp; offset += tmp; - if (offset == PAGE_SIZE) + if (offset == folio_size(folio)) offset = 0; - /* If we copied full page, mark it uptodate */ - if (tmp == PAGE_SIZE) + /* If we copied full folio, mark it uptodate */ + if (tmp == folio_size(folio)) folio_mark_uptodate(folio); if (folio_test_uptodate(folio)) { From 351a24eb48209b50e575a28a0abe07d551187ca8 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:34 -0700 Subject: [PATCH 17/25] fuse: support large folios for folio reads Add support for folios larger than one page size for folio reads into the page cache. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2d9bc484e87a..8efdca3ce566 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -793,7 +793,7 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); loff_t pos = folio_pos(folio); - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + struct fuse_folio_desc desc = { .length = folio_size(folio) }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, From cacc0645bcad3e20ad17e5942944c51cbe13297e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:35 -0700 Subject: [PATCH 18/25] fuse: support large folios for symlinks Support large folios for symlinks and change the name from fuse_getlink_page() to fuse_getlink_folio(). Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b670f1da31e6..a3d78aebbd0a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1629,10 +1629,10 @@ static int fuse_permission(struct mnt_idmap *idmap, return err; } -static int fuse_readlink_page(struct inode *inode, struct folio *folio) +static int fuse_readlink_folio(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_folio_desc desc = { .length = folio_size(folio) - 1 }; struct fuse_args_pages ap = { .num_folios = 1, .folios = &folio, @@ -1687,7 +1687,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (!folio) goto out_err; - err = fuse_readlink_page(inode, folio); + err = fuse_readlink_folio(inode, folio); if (err) { folio_put(folio); goto out_err; @@ -2288,7 +2288,7 @@ void fuse_init_dir(struct inode *inode) static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(folio->mapping->host, folio); + int err = fuse_readlink_folio(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); From c91440c89fbd9d7d23430e5b4cb9e2fd779cc078 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:36 -0700 Subject: [PATCH 19/25] fuse: support large folios for stores Add support for folios larger than one page size for stores. Also change variable naming from "this_num" to "nr_bytes". Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b7d0a81c5df1..e80cd8f2c049 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1788,18 +1788,23 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { struct folio *folio; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_grab_folio(mapping, index); err = PTR_ERR(folio); if (IS_ERR(folio)) goto out_iput; - this_num = min_t(unsigned, num, folio_size(folio) - offset); - err = fuse_copy_folio(cs, &folio, offset, this_num, 0); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + + err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0); if (!folio_test_uptodate(folio) && !err && offset == 0 && - (this_num == folio_size(folio) || file_size == end)) { - folio_zero_segment(folio, this_num, folio_size(folio)); + (nr_bytes == folio_size(folio) || file_size == end)) { + folio_zero_segment(folio, nr_bytes, folio_size(folio)); folio_mark_uptodate(folio); } folio_unlock(folio); @@ -1808,9 +1813,9 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, if (err) goto out_iput; - num -= this_num; + num -= nr_bytes; offset = 0; - index++; + index += nr_pages; } err = 0; From ff7c3ee4842d87f8fad000039d87692eb03e31e7 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:37 -0700 Subject: [PATCH 20/25] fuse: support large folios for queued writes Add support for folios larger than one page size for queued writes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8efdca3ce566..f221a45b4bad 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1789,11 +1789,14 @@ __releases(fi->lock) __acquires(fi->lock) { struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_args_pages *ap = &wpa->ia.ap; struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_args *args = &wpa->ia.ap.args; - /* Currently, all folios in FUSE are one page */ - __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; - int err; + struct fuse_args *args = &ap->args; + __u64 data_size = 0; + int err, i; + + for (i = 0; i < ap->num_folios; i++) + data_size += ap->descs[i].length; fi->writectr++; if (inarg->offset + data_size <= size) { From 906354c87f4917210aa4400708906200ae71614c Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:38 -0700 Subject: [PATCH 21/25] fuse: support large folios for readahead Add support for folios larger than one page size for readahead. Signed-off-by: Joanne Koong Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f221a45b4bad..07ff81469a59 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -876,14 +876,13 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_io_free(ia); } -static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = folio_pos(ap->folios[0]); - /* Currently, all folios in FUSE are one page */ - size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -918,6 +917,7 @@ static void fuse_readahead(struct readahead_control *rac) struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); unsigned int max_pages, nr_pages; + struct folio *folio = NULL; if (fuse_is_bad(inode)) return; @@ -939,8 +939,8 @@ static void fuse_readahead(struct readahead_control *rac) while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; - struct folio *folio; unsigned cur_pages = min(max_pages, nr_pages); + unsigned int pages = 0; if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -952,10 +952,12 @@ static void fuse_readahead(struct readahead_control *rac) ia = fuse_io_alloc(NULL, cur_pages); if (!ia) - return; + break; ap = &ia->ap; - while (ap->num_folios < cur_pages) { + while (pages < cur_pages) { + unsigned int folio_pages; + /* * This returns a folio with a ref held on it. * The ref needs to be held until the request is @@ -963,13 +965,31 @@ static void fuse_readahead(struct readahead_control *rac) * fuse_try_move_page()) drops the ref after it's * replaced in the page cache. */ - folio = __readahead_folio(rac); + if (!folio) + folio = __readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); + if (folio_pages > cur_pages - pages) { + /* + * Large folios belonging to fuse will never + * have more pages than max_pages. + */ + WARN_ON(!pages); + break; + } + ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].length = folio_size(folio); ap->num_folios++; + pages += folio_pages; + folio = NULL; } - fuse_send_readpages(ia, rac->file); - nr_pages -= cur_pages; + fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); + nr_pages -= pages; + } + if (folio) { + folio_end_read(folio, false); + folio_put(folio); } } From f3cb8bd908c72eb7b766eca05aac1685749eff02 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 12 May 2025 15:58:40 -0700 Subject: [PATCH 22/25] fuse: support large folios for writeback Add support for folios larger than one page size for writeback. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 07ff81469a59..3d0b33be3824 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1988,7 +1988,7 @@ static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struc ap->folios[folio_index] = folio; ap->descs[folio_index].offset = 0; - ap->descs[folio_index].length = PAGE_SIZE; + ap->descs[folio_index].length = folio_size(folio); inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); } @@ -2062,6 +2062,7 @@ struct fuse_fill_wb_data { struct fuse_file *ff; struct inode *inode; unsigned int max_folios; + unsigned int nr_pages; }; static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) @@ -2109,15 +2110,15 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, WARN_ON(!ap->num_folios); /* Reached max pages */ - if (ap->num_folios == fc->max_pages) + if (data->nr_pages + folio_nr_pages(folio) > fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) + if ((data->nr_pages * PAGE_SIZE) + folio_size(folio) > fc->max_write) return true; /* Discontinuity */ - if (ap->folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) + if (folio_next_index(ap->folios[ap->num_folios - 1]) != folio_index(folio)) return true; /* Need to grow the pages array? If so, did the expansion fail? */ @@ -2148,6 +2149,7 @@ static int fuse_writepages_fill(struct folio *folio, if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; + data->nr_pages = 0; } if (data->wpa == NULL) { @@ -2162,6 +2164,7 @@ static int fuse_writepages_fill(struct folio *folio, folio_start_writeback(folio); fuse_writepage_args_page_fill(wpa, folio, ap->num_folios); + data->nr_pages += folio_nr_pages(folio); err = 0; ap->num_folios++; @@ -2192,6 +2195,7 @@ static int fuse_writepages(struct address_space *mapping, data.inode = inode; data.wpa = NULL; data.ff = NULL; + data.nr_pages = 0; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { From c31f91c6af96a5eb0632f4aee8d4e39cad7d7559 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Apr 2025 10:53:58 +0200 Subject: [PATCH 23/25] fuse: don't allow signals to interrupt getdents copying When getting the directory contents, the entries are first fetched to a kernel buffer, then they are copied to userspace with dir_emit(). This second phase is non-blocking as long as the userspace buffer is not paged out, making it interruptible makes zero sense. Overload d_type as flags, since it only uses 4 bits from 32. Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/readdir.c | 4 ++-- fs/readdir.c | 18 +++++++++++++++--- include/linux/fs.h | 3 +++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 46b7146f2c0d..59defad9d05e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -120,7 +120,7 @@ static bool fuse_emit(struct file *file, struct dir_context *ctx, fuse_add_dirent_to_cache(file, dirent, ctx->pos); return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, - dirent->type); + dirent->type | FILLDIR_FLAG_NOINTR); } static int parse_dirfile(char *buf, size_t nbytes, struct file *file, @@ -422,7 +422,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, if (ff->readdir.pos == ctx->pos) { res = FOUND_SOME; if (!dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type)) + dirent->ino, dirent->type | FILLDIR_FLAG_NOINTR)) return FOUND_ALL; ctx->pos = dirent->off; } diff --git a/fs/readdir.c b/fs/readdir.c index 0038efda417b..857d402bc531 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -266,6 +266,10 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); int prev_reclen; + unsigned int flags = d_type; + + BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK); + d_type &= S_DT_MASK; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) @@ -279,7 +283,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen, return false; } prev_reclen = buf->prev_reclen; - if (prev_reclen && signal_pending(current)) + if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; @@ -351,6 +355,10 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); int prev_reclen; + unsigned int flags = d_type; + + BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK); + d_type &= S_DT_MASK; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) @@ -359,7 +367,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen, if (reclen > buf->count) return false; prev_reclen = buf->prev_reclen; - if (prev_reclen && signal_pending(current)) + if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; @@ -513,6 +521,10 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + namlen + 2, sizeof(compat_long_t)); int prev_reclen; + unsigned int flags = d_type; + + BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK); + d_type &= S_DT_MASK; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) @@ -526,7 +538,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen return false; } prev_reclen = buf->prev_reclen; - if (prev_reclen && signal_pending(current)) + if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..0f2a1a572e3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2073,6 +2073,9 @@ struct dir_context { loff_t pos; }; +/* If OR-ed with d_type, pending signals are not checked */ +#define FILLDIR_FLAG_NOINTR 0x1000 + /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. From 467e245d47e6662a4cbf4184d9e6d0b1b120c0bf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Apr 2025 12:44:55 +0200 Subject: [PATCH 24/25] readdir: supply dir_context.count as readdir buffer size hint This is a preparation for large readdir buffers in fuse. Simply setting the fuse buffer size to the userspace buffer size should work, the record sizes are similar (fuse's is slightly larger than libc's, so no overflow should ever happen). Signed-off-by: Miklos Szeredi Signed-off-by: Jaco Kroon --- fs/exportfs/expfs.c | 1 + fs/overlayfs/readdir.c | 12 +++++++++++- fs/readdir.c | 29 ++++++++++++++--------------- include/linux/fs.h | 7 +++++++ 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 128dd092916b..48f0c505b50f 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -284,6 +284,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child) }; struct getdents_callback buffer = { .ctx.actor = filldir_one, + .ctx.count = INT_MAX, .name = name, }; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 881ec5592da5..126c797751e9 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -351,6 +351,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, + .ctx.count = INT_MAX, .dentry = dentry, .list = list, .root = root, @@ -571,6 +572,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list, struct ovl_cache_entry *p, *n; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, + .ctx.count = INT_MAX, .list = list, .root = root, }; @@ -672,6 +674,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name, struct ovl_readdir_translate *rdt = container_of(ctx, struct ovl_readdir_translate, ctx); struct dir_context *orig_ctx = rdt->orig_ctx; + bool res; if (rdt->parent_ino && strcmp(name, "..") == 0) { ino = rdt->parent_ino; @@ -686,7 +689,10 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name, name, namelen, rdt->xinowarn); } - return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); + res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); + ctx->count = orig_ctx->count; + + return res; } static bool ovl_is_impure_dir(struct file *file) @@ -713,6 +719,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, + .ctx.count = ctx->count, .orig_ctx = ctx, .xinobits = ovl_xino_bits(ofs), .xinowarn = ovl_xino_warn(ofs), @@ -1073,6 +1080,7 @@ int ovl_check_d_type_supported(const struct path *realpath) int err; struct ovl_readdir_data rdd = { .ctx.actor = ovl_check_d_type, + .ctx.count = INT_MAX, .d_type_supported = false, }; @@ -1094,6 +1102,7 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, + .ctx.count = INT_MAX, .list = &list, }; bool incompat = false; @@ -1178,6 +1187,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, + .ctx.count = INT_MAX, .list = &list, }; diff --git a/fs/readdir.c b/fs/readdir.c index 857d402bc531..7764b8638978 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -222,6 +222,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, CLASS(fd_pos, f)(fd); struct readdir_callback buf = { .ctx.actor = fillonedir, + .ctx.count = 1, /* Hint to fs: just one entry. */ .dirent = dirent }; @@ -252,7 +253,6 @@ struct getdents_callback { struct dir_context ctx; struct linux_dirent __user * current_dir; int prev_reclen; - int count; int error; }; @@ -275,7 +275,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen, if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ - if (reclen > buf->count) + if (reclen > ctx->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { @@ -300,7 +300,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen, buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; - buf->count -= reclen; + ctx->count -= reclen; return true; efault_end: user_write_access_end(); @@ -315,7 +315,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, CLASS(fd_pos, f)(fd); struct getdents_callback buf = { .ctx.actor = filldir, - .count = count, + .ctx.count = count, .current_dir = dirent }; int error; @@ -333,7 +333,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else - error = count - buf.count; + error = count - buf.ctx.count; } return error; } @@ -342,7 +342,6 @@ struct getdents_callback64 { struct dir_context ctx; struct linux_dirent64 __user * current_dir; int prev_reclen; - int count; int error; }; @@ -364,7 +363,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen, if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ - if (reclen > buf->count) + if (reclen > ctx->count) return false; prev_reclen = buf->prev_reclen; if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) @@ -384,7 +383,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen, buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; - buf->count -= reclen; + ctx->count -= reclen; return true; efault_end: @@ -400,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, CLASS(fd_pos, f)(fd); struct getdents_callback64 buf = { .ctx.actor = filldir64, - .count = count, + .ctx.count = count, .current_dir = dirent }; int error; @@ -419,7 +418,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, if (put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else - error = count - buf.count; + error = count - buf.ctx.count; } return error; } @@ -483,6 +482,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, CLASS(fd_pos, f)(fd); struct compat_readdir_callback buf = { .ctx.actor = compat_fillonedir, + .ctx.count = 1, /* Hint to fs: just one entry. */ .dirent = dirent }; @@ -507,7 +507,6 @@ struct compat_getdents_callback { struct dir_context ctx; struct compat_linux_dirent __user *current_dir; int prev_reclen; - int count; int error; }; @@ -530,7 +529,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ - if (reclen > buf->count) + if (reclen > ctx->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { @@ -554,7 +553,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; - buf->count -= reclen; + ctx->count -= reclen; return true; efault_end: user_write_access_end(); @@ -569,8 +568,8 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, CLASS(fd_pos, f)(fd); struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, + .ctx.count = count, .current_dir = dirent, - .count = count }; int error; @@ -587,7 +586,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else - error = count - buf.count; + error = count - buf.ctx.count; } return error; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 0f2a1a572e3a..dfc5a3327124 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2071,6 +2071,13 @@ typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, struct dir_context { filldir_t actor; loff_t pos; + /* + * Filesystems MUST NOT MODIFY count, but may use as a hint: + * 0 unknown + * > 0 space in buffer (assume at least one entry) + * INT_MAX unlimited + */ + int count; }; /* If OR-ed with d_type, pending signals are not checked */ From dabb90391028799fb524680feb4eb96eb904ca6f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 8 Apr 2025 15:14:20 +0200 Subject: [PATCH 25/25] fuse: increase readdir buffer size Increase the buffer size to the count requested by userspace. This improves performance. Signed-off-by: Miklos Szeredi Signed-off-by: Bernd Schubert --- fs/fuse/readdir.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 59defad9d05e..c2aae2eef086 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -335,35 +335,32 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_io_args ia = {}; - struct fuse_args_pages *ap = &ia.ap; - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + struct fuse_args *args = &ia.ap.args; + void *buf; + size_t bufsize = clamp((unsigned int) ctx->count, PAGE_SIZE, fc->max_pages << PAGE_SHIFT); u64 attr_version = 0, evict_ctr = 0; bool locked; - folio = folio_alloc(GFP_KERNEL, 0); - if (!folio) + buf = kvmalloc(bufsize, GFP_KERNEL); + if (!buf) return -ENOMEM; + args->out_args[0].value = buf; + plus = fuse_use_readdirplus(inode, ctx); - ap->args.out_pages = true; - ap->num_folios = 1; - ap->folios = &folio; - ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); evict_ctr = fuse_get_evict_ctr(fm->fc); - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIRPLUS); } else { - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fm, &ap->args); + res = fuse_simple_request(fm, args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { @@ -372,16 +369,14 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(folio_address(folio), res, - file, ctx, attr_version, + res = parse_dirplusfile(buf, res, file, ctx, attr_version, evict_ctr); } else { - res = parse_dirfile(folio_address(folio), res, file, - ctx); + res = parse_dirfile(buf, res, file, ctx); } } - folio_put(folio); + kvfree(buf); fuse_invalidate_atime(inode); return res; }