From bd5603eaae0aabf527bfb3ce1bb07e979ce5bd50 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 10 Oct 2025 15:07:38 -0700 Subject: [PATCH 01/15] fuse: fix readahead reclaim deadlock Commit e26ee4efbc79 ("fuse: allocate ff->release_args only if release is needed") skips allocating ff->release_args if the server does not implement open. However in doing so, fuse_prepare_release() now skips grabbing the reference on the inode, which makes it possible for an inode to be evicted from the dcache while there are inflight readahead requests. This causes a deadlock if the server triggers reclaim while servicing the readahead request and reclaim attempts to evict the inode of the file being read ahead. Since the folio is locked during readahead, when reclaim evicts the fuse inode and fuse_evict_inode() attempts to remove all folios associated with the inode from the page cache (truncate_inode_pages_range()), reclaim will block forever waiting for the lock since readahead cannot relinquish the lock because it is itself blocked in reclaim: >>> stack_trace(1504735) folio_wait_bit_common (mm/filemap.c:1308:4) folio_lock (./include/linux/pagemap.h:1052:3) truncate_inode_pages_range (mm/truncate.c:336:10) fuse_evict_inode (fs/fuse/inode.c:161:2) evict (fs/inode.c:704:3) dentry_unlink_inode (fs/dcache.c:412:3) __dentry_kill (fs/dcache.c:615:3) shrink_kill (fs/dcache.c:1060:12) shrink_dentry_list (fs/dcache.c:1087:3) prune_dcache_sb (fs/dcache.c:1168:2) super_cache_scan (fs/super.c:221:10) do_shrink_slab (mm/shrinker.c:435:9) shrink_slab (mm/shrinker.c:626:10) shrink_node (mm/vmscan.c:5951:2) shrink_zones (mm/vmscan.c:6195:3) do_try_to_free_pages (mm/vmscan.c:6257:3) do_swap_page (mm/memory.c:4136:11) handle_pte_fault (mm/memory.c:5562:10) handle_mm_fault (mm/memory.c:5870:9) do_user_addr_fault (arch/x86/mm/fault.c:1338:10) handle_page_fault (arch/x86/mm/fault.c:1481:3) exc_page_fault (arch/x86/mm/fault.c:1539:2) asm_exc_page_fault+0x22/0x27 Fix this deadlock by allocating 
ff->release_args and grabbing the reference on the inode when preparing the file for release even if the server does not implement open. The inode reference will be dropped when the last reference on the fuse file is dropped (see fuse_file_put() -> fuse_release_end()). Fixes: e26ee4efbc79 ("fuse: allocate ff->release_args only if release is needed") Cc: stable@vger.kernel.org Signed-off-by: Joanne Koong Reported-by: Omar Sandoval Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f1ef77a0be05..4d96e684d736 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -110,7 +110,9 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) fuse_file_io_release(ff, ra->inode); if (!args) { - /* Do nothing when server does not implement 'open' */ + /* Do nothing when server does not implement 'opendir' */ + } else if (args->opcode == FUSE_RELEASE && ff->fm->fc->no_open) { + fuse_release_end(ff->fm, args, 0); } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); @@ -131,8 +133,17 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; bool open = isdir ? !fc->no_opendir : !fc->no_open; + bool release = !isdir || open; - ff = fuse_file_alloc(fm, open); + /* + * ff->args->release_args still needs to be allocated (so we can hold an + * inode reference while there are pending inflight file operations when + * ->release() is called, see fuse_prepare_release()) even if + * fc->no_open is set else it becomes possible for reclaim to deadlock + * if while servicing the readahead request the server triggers reclaim + * and reclaim evicts the inode of the file being read ahead. 
+ */ + ff = fuse_file_alloc(fm, release); if (!ff) return ERR_PTR(-ENOMEM); @@ -152,13 +163,14 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, fuse_file_free(ff); return ERR_PTR(err); } else { - /* No release needed */ - kfree(ff->args); - ff->args = NULL; - if (isdir) + if (isdir) { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; fc->no_opendir = 1; - else + } else { fc->no_open = 1; + } } } From 6e0d7f7f4a43ac8868e98c87ecf48805aa8c24dd Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Tue, 21 Oct 2025 22:46:42 +0200 Subject: [PATCH 02/15] fuse: missing copy_finish in fuse-over-io-uring argument copies Fix a possible reference count leak of payload pages during fuse argument copies. [Joanne: simplified error cleanup] Fixes: c090c8abae4b ("fuse: Add io-uring sqe commit and fetch support") Cc: stable@vger.kernel.org # v6.14 Signed-off-by: Cheng Ding Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- fs/fuse/dev_uring.c | 5 ++++- fs/fuse/fuse_dev_i.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 132f38619d70..49b18d7accb3 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -846,7 +846,7 @@ void fuse_copy_init(struct fuse_copy_state *cs, bool write, } /* Unmap and put previous page of userspace buffer */ -static void fuse_copy_finish(struct fuse_copy_state *cs) +void fuse_copy_finish(struct fuse_copy_state *cs) { if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f6b12aebb8bb..ed34676703e3 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -598,7 +598,9 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, cs.is_uring = true; cs.req = req; - return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + fuse_copy_finish(&cs); + return err; } /* @@ -649,6 +651,7 @@ 
static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, /* copy the payload */ err = fuse_copy_args(&cs, num_args, args->in_pages, (struct fuse_arg *)in_args, 0); + fuse_copy_finish(&cs); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); return err; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 6e8373f97040..134bf44aff0d 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -62,6 +62,7 @@ void fuse_dev_end_requests(struct list_head *head); void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter); +void fuse_copy_finish(struct fuse_copy_state *cs); int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, unsigned int argpages, struct fuse_arg *args, int zeroing); From 2a36511609cc7c4817c0998d4651f8c188a6db18 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 21 Oct 2025 22:46:43 +0200 Subject: [PATCH 03/15] fuse: Fix whitespace for fuse_uring_args_to_ring() comment The function comment accidentally got wrong indentation. 
Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/dev_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index ed34676703e3..0066c9c0a5d5 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -603,9 +603,9 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, return err; } - /* - * Copy data from the req to the ring buffer - */ +/* + * Copy data from the req to the ring buffer + */ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, struct fuse_ring_ent *ent) { From 66c6a77e00a2f28330cca90c67339111cd54e54b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 3 Nov 2025 14:10:38 +0100 Subject: [PATCH 04/15] fuse: add WARN_ON and comment for RCU revalidate Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 43ee4c7296a7..a153f449e2ad 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -128,9 +128,17 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int i; int ret = 1; - /* Careful in RCU mode */ - if (!inode) + if (!inode) { + /* + * Lookup of negative dentries will call ovl_dentry_init_flags() + * with NULL upperdentry and NULL oe, resulting in the + * DCACHE_OP*_REVALIDATE flags being cleared. Hence the only + * way to get a negative inode is due to a race with dentry + * destruction. 
+ */ + WARN_ON(!(flags & LOOKUP_RCU)); return -ECHILD; + } oe = OVL_I_E(inode); lowerstack = ovl_lowerstack(oe); From 395b95530343e7f4bdd2870190d985a222997fb6 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 16 Sep 2025 14:53:07 +0100 Subject: [PATCH 05/15] dcache: export shrink_dentry_list() and add new helper d_dispose_if_unused() Add and export a new helper d_dispose_if_unused() which is simply a wrapper around to_shrink_list(), to add an entry to a dispose list if it's not used anymore. Also export shrink_dentry_list() to kill all dentries in a dispose list. Suggested-by: Miklos Szeredi Signed-off-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/dcache.c | 18 ++++++++++++------ include/linux/dcache.h | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 035cccbc9276..bffb1b47a907 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1086,6 +1086,15 @@ struct dentry *d_find_alias_rcu(struct inode *inode) return de; } +void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) +{ + spin_lock(&dentry->d_lock); + if (!dentry->d_lockref.count) + to_shrink_list(dentry, dispose); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_dispose_if_unused); + /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. 
@@ -1096,12 +1105,8 @@ void d_prune_aliases(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { - spin_lock(&dentry->d_lock); - if (!dentry->d_lockref.count) - to_shrink_list(dentry, &dispose); - spin_unlock(&dentry->d_lock); - } + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) + d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } @@ -1141,6 +1146,7 @@ void shrink_dentry_list(struct list_head *list) shrink_kill(dentry); } } +EXPORT_SYMBOL(shrink_dentry_list); static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c83e02b94389..2bc1339bf6d0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -268,6 +268,8 @@ extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); +extern void d_dispose_if_unused(struct dentry *, struct list_head *); +extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); From ab84ad5973869a660ca3ad0c54a2b84d975d47c4 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 16 Sep 2025 14:53:08 +0100 Subject: [PATCH 06/15] fuse: new work queue to periodically invalidate expired dentries This patch adds the necessary infrastructure to keep track of all dentries created for FUSE file systems. A set of rbtrees, protected by hashed locks, will be used to keep all these dentries sorted by expiry time. A new module parameter 'inval_wq' is also added. When set, it will start a work queue which will periodically invalidate expired dentries. The value of this new parameter is the period, in seconds, for this work queue. Once this parameter is set, every new dentry will be added to one of the rbtrees. 
When the work queue is executed, it will check all the rbtrees and will invalidate those dentries that have timed-out. The work queue period can not be smaller than 5 seconds, but can be disabled by setting 'inval_wq' to zero (which is the default). Signed-off-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 224 +++++++++++++++++++++++++++++++++++++++++------ fs/fuse/fuse_i.h | 10 +++ fs/fuse/inode.c | 3 + 3 files changed, 212 insertions(+), 25 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ecaec0fea3a1..77982fdbcf27 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -27,6 +27,67 @@ module_param(allow_sys_admin_access, bool, 0644); MODULE_PARM_DESC(allow_sys_admin_access, "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); +struct dentry_bucket { + struct rb_root tree; + spinlock_t lock; +}; + +#define HASH_BITS 5 +#define HASH_SIZE (1 << HASH_BITS) +static struct dentry_bucket dentry_hash[HASH_SIZE]; +struct delayed_work dentry_tree_work; + +/* Minimum invalidation work queue frequency */ +#define FUSE_DENTRY_INVAL_FREQ_MIN 5 + +unsigned __read_mostly inval_wq; +static int inval_wq_set(const char *val, const struct kernel_param *kp) +{ + unsigned int num; + unsigned int old = inval_wq; + int ret; + + if (!val) + return -EINVAL; + + ret = kstrtouint(val, 0, &num); + if (ret) + return ret; + + if ((num < FUSE_DENTRY_INVAL_FREQ_MIN) && (num != 0)) + return -EINVAL; + + /* This should prevent overflow in secs_to_jiffies() */ + if (num > USHRT_MAX) + return -EINVAL; + + *((unsigned int *)kp->arg) = num; + + if (num && !old) + schedule_delayed_work(&dentry_tree_work, + secs_to_jiffies(num)); + else if (!num && old) + cancel_delayed_work_sync(&dentry_tree_work); + + return 0; +} +static const struct kernel_param_ops inval_wq_ops = { + .set = inval_wq_set, + .get = param_get_uint, +}; +module_param_cb(inval_wq, &inval_wq_ops, &inval_wq, 0644); +__MODULE_PARM_TYPE(inval_wq, "uint"); 
+MODULE_PARM_DESC(inval_wq, + "Dentries invalidation work queue period in secs (>= " + __stringify(FUSE_DENTRY_INVAL_FREQ_MIN) ")."); + +static inline struct dentry_bucket *get_dentry_bucket(struct dentry *dentry) +{ + int i = hash_ptr(dentry, HASH_BITS); + + return &dentry_hash[i]; +} + static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -34,33 +95,131 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } -#if BITS_PER_LONG >= 64 -static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) -{ - entry->d_fsdata = (void *) time; -} - -static inline u64 fuse_dentry_time(const struct dentry *entry) -{ - return (u64)entry->d_fsdata; -} - -#else -union fuse_dentry { +struct fuse_dentry { u64 time; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct rb_node node; + }; + struct dentry *dentry; }; +static void __fuse_dentry_tree_del_node(struct fuse_dentry *fd, + struct dentry_bucket *bucket) +{ + if (!RB_EMPTY_NODE(&fd->node)) { + rb_erase(&fd->node, &bucket->tree); + RB_CLEAR_NODE(&fd->node); + } +} + +static void fuse_dentry_tree_del_node(struct dentry *dentry) +{ + struct fuse_dentry *fd = dentry->d_fsdata; + struct dentry_bucket *bucket = get_dentry_bucket(dentry); + + spin_lock(&bucket->lock); + __fuse_dentry_tree_del_node(fd, bucket); + spin_unlock(&bucket->lock); +} + +static void fuse_dentry_tree_add_node(struct dentry *dentry) +{ + struct fuse_dentry *fd = dentry->d_fsdata; + struct dentry_bucket *bucket; + struct fuse_dentry *cur; + struct rb_node **p, *parent = NULL; + + if (!inval_wq) + return; + + bucket = get_dentry_bucket(dentry); + + spin_lock(&bucket->lock); + + __fuse_dentry_tree_del_node(fd, bucket); + + p = &bucket->tree.rb_node; + while (*p) { + parent = *p; + cur = rb_entry(*p, struct fuse_dentry, node); + if (fd->time < cur->time) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&fd->node, 
parent, p); + rb_insert_color(&fd->node, &bucket->tree); + spin_unlock(&bucket->lock); +} + +/* + * work queue which, when enabled, will periodically check for expired dentries + * in the dentries tree. + */ +static void fuse_dentry_tree_work(struct work_struct *work) +{ + LIST_HEAD(dispose); + struct fuse_dentry *fd; + struct rb_node *node; + int i; + + for (i = 0; i < HASH_SIZE; i++) { + spin_lock(&dentry_hash[i].lock); + node = rb_first(&dentry_hash[i].tree); + while (node) { + fd = rb_entry(node, struct fuse_dentry, node); + if (time_after64(get_jiffies_64(), fd->time)) { + rb_erase(&fd->node, &dentry_hash[i].tree); + RB_CLEAR_NODE(&fd->node); + spin_unlock(&dentry_hash[i].lock); + d_dispose_if_unused(fd->dentry, &dispose); + cond_resched(); + spin_lock(&dentry_hash[i].lock); + } else + break; + node = rb_first(&dentry_hash[i].tree); + } + spin_unlock(&dentry_hash[i].lock); + shrink_dentry_list(&dispose); + } + + if (inval_wq) + schedule_delayed_work(&dentry_tree_work, + secs_to_jiffies(inval_wq)); +} + +void fuse_dentry_tree_init(void) +{ + int i; + + for (i = 0; i < HASH_SIZE; i++) { + spin_lock_init(&dentry_hash[i].lock); + dentry_hash[i].tree = RB_ROOT; + } + INIT_DELAYED_WORK(&dentry_tree_work, fuse_dentry_tree_work); +} + +void fuse_dentry_tree_cleanup(void) +{ + int i; + + inval_wq = 0; + cancel_delayed_work_sync(&dentry_tree_work); + + for (i = 0; i < HASH_SIZE; i++) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&dentry_hash[i].tree)); +} + static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) { - ((union fuse_dentry *) dentry->d_fsdata)->time = time; + ((struct fuse_dentry *) dentry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(const struct dentry *entry) { - return ((union fuse_dentry *) entry->d_fsdata)->time; + return ((struct fuse_dentry *) entry->d_fsdata)->time; } -#endif static void fuse_dentry_settime(struct dentry *dentry, u64 time) { @@ -81,6 +240,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) } 
__fuse_dentry_settime(dentry, time); + fuse_dentry_tree_add_node(dentry); } /* @@ -283,21 +443,36 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, goto out; } -#if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), - GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); + struct fuse_dentry *fd; - return dentry->d_fsdata ? 0 : -ENOMEM; + fd = kzalloc(sizeof(struct fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); + if (!fd) + return -ENOMEM; + + fd->dentry = dentry; + RB_CLEAR_NODE(&fd->node); + dentry->d_fsdata = fd; + + return 0; } + +static void fuse_dentry_prune(struct dentry *dentry) +{ + struct fuse_dentry *fd = dentry->d_fsdata; + + if (!RB_EMPTY_NODE(&fd->node)) + fuse_dentry_tree_del_node(dentry); +} + static void fuse_dentry_release(struct dentry *dentry) { - union fuse_dentry *fd = dentry->d_fsdata; + struct fuse_dentry *fd = dentry->d_fsdata; kfree_rcu(fd, rcu); } -#endif static int fuse_dentry_delete(const struct dentry *dentry) { @@ -331,10 +506,9 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, -#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, + .d_prune = fuse_dentry_prune, .d_release = fuse_dentry_release, -#endif .d_automount = fuse_dentry_automount, }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c2f2a48156d6..ac717b3b46a1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -54,6 +54,13 @@ /** Frequency (in jiffies) of request timeout checks, if opted into */ extern const unsigned long fuse_timeout_timer_freq; +/* + * Dentries invalidation workqueue period, in seconds. The value of this + * parameter shall be >= FUSE_DENTRY_INVAL_FREQ_MIN seconds, or 0 (zero), in + * which case no workqueue will be created. 
+ */ +extern unsigned inval_wq __read_mostly; + /** Maximum of max_pages received in init_out */ extern unsigned int fuse_max_pages_limit; /* @@ -1277,6 +1284,9 @@ void fuse_wait_aborted(struct fuse_conn *fc); /* Check if any requests timed out */ void fuse_check_timeout(struct work_struct *work); +void fuse_dentry_tree_init(void); +void fuse_dentry_tree_cleanup(void); + /** * Invalidate inode attributes */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d1babf56f254..793f1766ae5a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -2294,6 +2294,8 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; + fuse_dentry_tree_init(); + sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); @@ -2313,6 +2315,7 @@ static void __exit fuse_exit(void) { pr_debug("exit\n"); + fuse_dentry_tree_cleanup(); fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); From 64becd224ff99dbdcffab22709dfcf170e52aff1 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 16 Sep 2025 14:53:09 +0100 Subject: [PATCH 07/15] fuse: new work queue to invalidate dentries from old epochs With the infrastructure introduced to periodically invalidate expired dentries, it is now possible to add an extra work queue to invalidate dentries when an epoch is incremented. This work queue will only be triggered when the 'inval_wq' parameter is set. Signed-off-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 7 ++++--- fs/fuse/dir.c | 21 +++++++++++++++++++++ fs/fuse/fuse_i.h | 4 ++++ fs/fuse/inode.c | 2 ++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 49b18d7accb3..6d59cbc877c6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2041,13 +2041,14 @@ static int fuse_notify_resend(struct fuse_conn *fc) /* * Increments the fuse connection epoch. This will result of dentries from - * previous epochs to be invalidated. - * - * XXX optimization: add call to shrink_dcache_sb()? 
+ * previous epochs to be invalidated. Additionally, if inval_wq is set, a work + * queue is scheduled to trigger the invalidation. */ static int fuse_notify_inc_epoch(struct fuse_conn *fc) { atomic_inc(&fc->epoch); + if (inval_wq) + schedule_work(&fc->epoch_work); return 0; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 77982fdbcf27..8ef8134e1cd5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -189,6 +189,27 @@ static void fuse_dentry_tree_work(struct work_struct *work) secs_to_jiffies(inval_wq)); } +void fuse_epoch_work(struct work_struct *work) +{ + struct fuse_conn *fc = container_of(work, struct fuse_conn, + epoch_work); + struct fuse_mount *fm; + struct inode *inode; + + down_read(&fc->killsb); + + inode = fuse_ilookup(fc, FUSE_ROOT_ID, &fm); + iput(inode); + + if (fm) { + /* Remove all possible active references to cached inodes */ + shrink_dcache_sb(fm->sb); + } else + pr_warn("Failed to get root inode"); + + up_read(&fc->killsb); +} + void fuse_dentry_tree_init(void) { int i; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ac717b3b46a1..a80411028254 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -649,6 +649,8 @@ struct fuse_conn { /** Current epoch for up-to-date dentries */ atomic_t epoch; + struct work_struct epoch_work; + struct rcu_head rcu; /** The user id for this mount */ @@ -1287,6 +1289,8 @@ void fuse_check_timeout(struct work_struct *work); void fuse_dentry_tree_init(void); void fuse_dentry_tree_cleanup(void); +void fuse_epoch_work(struct work_struct *work); + /** * Invalidate inode attributes */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 793f1766ae5a..3087165a6004 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -977,6 +977,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); atomic_set(&fc->epoch, 1); + INIT_WORK(&fc->epoch_work, fuse_epoch_work); init_waitqueue_head(&fc->blocked_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); 
INIT_LIST_HEAD(&fc->bg_queue); @@ -1029,6 +1030,7 @@ void fuse_conn_put(struct fuse_conn *fc) fuse_dax_conn_free(fc); if (fc->timeout.req_timeout) cancel_delayed_work_sync(&fc->timeout.work); + cancel_work_sync(&fc->epoch_work); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); From b4909ae8d4e95a5046bcba099a3afdef8024b1b2 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 16 Sep 2025 14:53:10 +0100 Subject: [PATCH 08/15] fuse: refactor fuse_conn_put() to remove negative logic. There is no functional change with this patch. It simply refactors function fuse_conn_put() to not use negative logic, which makes it easier to read. Signed-off-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3087165a6004..21e04c394a80 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1022,27 +1022,28 @@ static void delayed_release(struct rcu_head *p) void fuse_conn_put(struct fuse_conn *fc) { - if (refcount_dec_and_test(&fc->count)) { - struct fuse_iqueue *fiq = &fc->iq; - struct fuse_sync_bucket *bucket; + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_sync_bucket *bucket; - if (IS_ENABLED(CONFIG_FUSE_DAX)) - fuse_dax_conn_free(fc); - if (fc->timeout.req_timeout) - cancel_delayed_work_sync(&fc->timeout.work); - cancel_work_sync(&fc->epoch_work); - if (fiq->ops->release) - fiq->ops->release(fiq); - put_pid_ns(fc->pid_ns); - bucket = rcu_dereference_protected(fc->curr_bucket, 1); - if (bucket) { - WARN_ON(atomic_read(&bucket->count) != 1); - kfree(bucket); - } - if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) - fuse_backing_files_free(fc); - call_rcu(&fc->rcu, delayed_release); + if (!refcount_dec_and_test(&fc->count)) + return; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); + if (fc->timeout.req_timeout) + cancel_delayed_work_sync(&fc->timeout.work); +
cancel_work_sync(&fc->epoch_work); + if (fiq->ops->release) + fiq->ops->release(fiq); + put_pid_ns(fc->pid_ns); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + if (bucket) { + WARN_ON(atomic_read(&bucket->count) != 1); + kfree(bucket); } + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_free(fc); + call_rcu(&fc->rcu, delayed_release); } EXPORT_SYMBOL_GPL(fuse_conn_put); From c755a09b52c09b8d67ef35b4ac3166ca6e797bee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Mon, 29 Sep 2025 15:02:45 +0200 Subject: [PATCH 09/15] fuse: use strscpy instead of strcpy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As pointed out in [1], strcpy() is deprecated in favor of strscpy(). Furthermore, the size of the buffer for the name to be copied is well known at this point since we are going to move the pointer by that much on the next line. Hence, it's safe to assume 'namelen' for the size of the string to be copied. [1] https://github.com/KSPP/linux/issues/88 Signed-off-by: Miquel Sabaté Solà Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8ef8134e1cd5..70f364aa6541 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -699,7 +699,7 @@ static int get_security_context(struct dentry *entry, umode_t mode, fctx->size = lsmctx.len; ptr += sizeof(*fctx); - strcpy(ptr, name); + strscpy(ptr, name, namelen); ptr += namelen; memcpy(ptr, lsmctx.context, lsmctx.len); From 47781ee71fb6bf2e082580b98be72411b99b6e04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Mon, 29 Sep 2025 15:02:46 +0200 Subject: [PATCH 10/15] fuse: rename 'namelen' to 'namesize' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By "length of a string" usually the number of non-null chars is meant (i.e. strlen(str)). 
So the variable 'namelen' was confusingly named, whereas 'namesize' refers more to what's being done in 'get_security_context'. Suggested-by: Miklos Szeredi Signed-off-by: Miquel Sabaté Solà Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 70f364aa6541..963f53f394c6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -666,7 +666,7 @@ static int get_security_context(struct dentry *entry, umode_t mode, u32 total_len = sizeof(*header); int err, nr_ctx = 0; const char *name = NULL; - size_t namelen; + size_t namesize; err = security_dentry_init_security(entry, mode, &entry->d_name, &name, &lsmctx); @@ -677,12 +677,12 @@ static int get_security_context(struct dentry *entry, umode_t mode, if (lsmctx.len) { nr_ctx = 1; - namelen = strlen(name) + 1; + namesize = strlen(name) + 1; err = -EIO; - if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || + if (WARN_ON(namesize > XATTR_NAME_MAX + 1 || lsmctx.len > S32_MAX)) goto out_err; - total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namesize + lsmctx.len); } @@ -699,8 +699,8 @@ static int get_security_context(struct dentry *entry, umode_t mode, fctx->size = lsmctx.len; ptr += sizeof(*fctx); - strscpy(ptr, name, namelen); - ptr += namelen; + strscpy(ptr, name, namesize); + ptr += namesize; memcpy(ptr, lsmctx.context, lsmctx.len); } From b359af8275a982a458e8df6c6beab1415be1f795 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Thu, 23 Oct 2025 00:21:17 +0200 Subject: [PATCH 11/15] fuse: Invalidate the page cache after FOPEN_DIRECT_IO write generic_file_direct_write() also does this and has a large comment about it. The reproducer here is xfstest's generic/209, which tests exactly this case: a DIO write competing with a cached IO read.
Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4d96e684d736..b60f394df5a3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1693,6 +1693,15 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (res > 0) *ppos = pos; + if (res > 0 && write && fopen_direct_io) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + invalidate_inode_pages2_range(mapping, idx_from, idx_to); + } + return res > 0 ? res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); From 1ce120dcefc056ce8af2486cebbb77a458aad4c3 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Thu, 23 Oct 2025 00:21:18 +0200 Subject: [PATCH 12/15] fuse: Always flush the page cache before FOPEN_DIRECT_IO write This was done as condition on direct_io_allow_mmap, but I believe this is not right, as a file might be open two times - once with write-back enabled another time with FOPEN_DIRECT_IO. Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b60f394df5a3..6014d588845c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1619,7 +1619,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - if (fopen_direct_io && fc->direct_io_allow_mmap) { + if (fopen_direct_io) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { fuse_io_free(ia); From 28fec8b95e67704df7b841dc4cbbba0957078213 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Fri, 5 Sep 2025 09:30:48 -0700 Subject: [PATCH 13/15] fuse: signal that a fuse inode should exhibit local fs behaviors Create a new fuse inode flag that indicates that the kernel should implement various local filesystem behaviors instead of passing vfs commands straight through to the fuse server and expecting the server to do all the work. For example, this means that we'll use the kernel to transform some ACL updates into mode changes, and later to do enforcement of the immutable and append iflags. Signed-off-by: "Darrick J. Wong" Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a80411028254..026c6c0de3f4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -239,6 +239,11 @@ enum { FUSE_I_BTIME, /* Wants or already has page cache IO */ FUSE_I_CACHE_IO_MODE, + /* + * Client has exclusive access to the inode, either because fs is local + * or the fuse server has an exclusive "lease" on distributed fs + */ + FUSE_I_EXCLUSIVE, }; struct fuse_conn; @@ -1055,7 +1060,7 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode) return get_fuse_mount_super(inode->i_sb)->fc; } -static inline struct fuse_inode *get_fuse_inode(struct inode *inode) +static inline struct fuse_inode *get_fuse_inode(const struct inode *inode) { return container_of(inode, struct fuse_inode, inode); } @@ -1097,6 +1102,13 @@ static inline bool fuse_is_bad(struct inode *inode) return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); } +static inline bool fuse_inode_is_exclusive(const struct inode *inode) +{ + const struct fuse_inode *fi = get_fuse_inode(inode); + + return test_bit(FUSE_I_EXCLUSIVE, &fi->state); +} + static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags, struct fuse_folio_desc **desc) { From 95c39eef7c2b666026c69ab5b30471da94ea2874 Mon Sep 17 00:00:00 2001 From: Joanne 
Koong Date: Tue, 25 Nov 2025 10:13:47 -0800 Subject: [PATCH 14/15] fuse: fix io-uring list corruption for terminated non-committed requests When a request is terminated before it has been committed, the request is not removed from the queue's list. This leaves a dangling list entry that leads to list corruption and use-after-free issues. Remove the request from the queue's list for terminated non-committed requests. Signed-off-by: Joanne Koong Fixes: c090c8abae4b ("fuse: Add io-uring sqe commit and fetch support") Cc: stable@vger.kernel.org Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/dev_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 0066c9c0a5d5..7760fe4e1f9e 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -86,6 +86,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, lockdep_assert_not_held(&queue->lock); spin_lock(&queue->lock); ent->fuse_req = NULL; + list_del_init(&req->list); if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); From 8da059f2a497a2427150faae5adc3bb78e73b3e2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 24 Nov 2025 09:04:56 +0300 Subject: [PATCH 15/15] fuse: Uninitialized variable in fuse_epoch_work() The fuse_ilookup() function only sets *fm on the success path so this "if (fm) {" NULL check doesn't work. The "fm" pointer is either uninitialized or valid. Check the "inode" pointer instead. Also, while it's not necessary, it is cleaner to move the iput(inode) under the NULL check as well. 
Fixes: 64becd224ff9 ("fuse: new work queue to invalidate dentries from old epochs") Signed-off-by: Dan Carpenter Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 963f53f394c6..2aec225740a0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -199,9 +199,8 @@ void fuse_epoch_work(struct work_struct *work) down_read(&fc->killsb); inode = fuse_ilookup(fc, FUSE_ROOT_ID, &fm); - iput(inode); - - if (fm) { + if (inode) { + iput(inode); /* Remove all possible active references to cached inodes */ shrink_dcache_sb(fm->sb); } else