From 360600f8ec635c3d47789ade3d3c120c88981343 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 19:05:21 -0400 Subject: [PATCH 01/64] fs/namespace.c: fix the namespace_sem guard mess If anything, namespace_lock should be DEFINE_LOCK_GUARD_0, not DEFINE_GUARD. That way we * do not need to feed it a bogus argument * do not get gcc trying to compare an address of static in file variable with -4097 - and, if we are unlucky, trying to keep it in a register, with spills and all such. The same problems apply to grabbing namespace_sem shared. Rename it to namespace_excl, add namespace_shared, convert the existing users: guard(namespace_lock, &namespace_sem) => guard(namespace_excl)() guard(rwsem_read, &namespace_sem) => guard(namespace_shared)() scoped_guard(namespace_lock, &namespace_sem) => scoped_guard(namespace_excl) scoped_guard(rwsem_read, &namespace_sem) => scoped_guard(namespace_shared) Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ae6d1312b184..fcea65587ff9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -82,6 +82,12 @@ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ static DEFINE_SEQLOCK(mnt_ns_tree_lock); +static inline void namespace_lock(void); +static void namespace_unlock(void); +DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock()) +DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem), + up_read(&namespace_sem)) + #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif @@ -1776,8 +1782,6 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } -DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock()) - enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, @@ -2306,7 +2310,7 @@ struct path *collect_paths(const struct path *path, struct path *res = prealloc, *to_free = NULL; unsigned n = 0; - guard(rwsem_read)(&namespace_sem); + guard(namespace_shared)(); if (!check_mnt(root)) return ERR_PTR(-EINVAL); @@ -2361,7 +2365,7 @@ void dissolve_on_fput(struct vfsmount *mnt) return; } - scoped_guard(namespace_lock, &namespace_sem) { + scoped_guard(namespace_excl) { if (!anon_ns_root(m)) return; @@ -2435,7 +2439,7 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - guard(rwsem_read)(&namespace_sem); + guard(namespace_shared)(); if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); @@ -5957,7 +5961,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (ret) return ret; - scoped_guard(rwsem_read, &namespace_sem) + scoped_guard(namespace_shared) ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns); if (!ret) @@ -6079,7 +6083,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, * We only need to guard against mount topology changes as * listmount() doesn't care about any mount properties. */ - scoped_guard(rwsem_read, &namespace_sem) + scoped_guard(namespace_shared) ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) From d154f185758994ea2cc2c15575a3653334562184 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 19:54:45 -0400 Subject: [PATCH 02/64] introduced guards for mount_lock mount_writer: write_seqlock; that's an equivalent of {un,}lock_mount_hash() mount_locked_reader: read_seqlock_excl; these tend to be open-coded. No bulk conversions, please - if nothing else, quite a few places take use mount_writer form when mount_locked_reader is sufficent. It needs to be dealt with carefully. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/mount.h b/fs/mount.h index 97737051a8b9..ed8c83ba836a 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -154,6 +154,11 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) extern seqlock_t mount_lock; +DEFINE_LOCK_GUARD_0(mount_writer, write_seqlock(&mount_lock), + write_sequnlock(&mount_lock)) +DEFINE_LOCK_GUARD_0(mount_locked_reader, read_seqlock_excl(&mount_lock), + read_sequnlock_excl(&mount_lock)) + struct proc_mounts { struct mnt_namespace *ns; struct path root; From 547af12dcd43851a4894ffc3c60e381ab9f0f5e7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Aug 2025 02:06:57 -0400 Subject: [PATCH 03/64] fs/namespace.c: allow to drop vfsmount references via __free(mntput) Note that just as path_put, it should never be done in scope of namespace_sem, be it shared or exclusive. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index fcea65587ff9..767ab751ee2a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -88,6 +88,8 @@ DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock()) DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem), up_read(&namespace_sem)) +DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T)) + #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif From 902e9904672bedcb25c6740d5c1d09e17de807eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 20:12:47 -0400 Subject: [PATCH 04/64] __detach_mounts(): use guards Clean fit for guards use; guards can't be weaker due to umount_tree() calls. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 767ab751ee2a..1ae1ab8815c9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2032,10 +2032,11 @@ void __detach_mounts(struct dentry *dentry) struct pinned_mountpoint mp = {}; struct mount *mnt; - namespace_lock(); - lock_mount_hash(); + guard(namespace_excl)(); + guard(mount_writer)(); + if (!lookup_mountpoint(dentry, &mp)) - goto out_unlock; + return; event++; while (mp.node.next) { @@ -2047,9 +2048,6 @@ void __detach_mounts(struct dentry *dentry) else umount_tree(mnt, UMOUNT_CONNECTED); } unpin_mountpoint(&mp); -out_unlock: - unlock_mount_hash(); - namespace_unlock(); } /* From 4151c3cc58698584e8b0427c40eeda671c3e08c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 20:14:22 -0400 Subject: [PATCH 05/64] __is_local_mountpoint(): use guards clean fit; namespace_shared due to iterating through ns->mounts. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 1ae1ab8815c9..f1460ddd1486 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -906,17 +906,14 @@ bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; - bool is_covered = false; - down_read(&namespace_sem); - rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { - is_covered = (mnt->mnt_mountpoint == dentry); - if (is_covered) - break; - } - up_read(&namespace_sem); + guard(namespace_shared)(); - return is_covered; + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) + if (mnt->mnt_mountpoint == dentry) + return true; + + return false; } struct pinned_mountpoint { From 12cdd1af7a6d123fc4690bae1807d8c90c5c5580 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 20:16:46 -0400 Subject: [PATCH 06/64] do_change_type(): use guards clean fit; namespace_excl to modify propagation graph Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index f1460ddd1486..a6a7b068770a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2899,7 +2899,7 @@ static int do_change_type(struct path *path, int ms_flags) struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; - int err = 0; + int err; if (!path_mounted(path)) return -EINVAL; @@ -2908,23 +2908,22 @@ static int do_change_type(struct path *path, int ms_flags) if (!type) return -EINVAL; - namespace_lock(); + guard(namespace_excl)(); + err = may_change_propagation(mnt); if (err) - goto out_unlock; + return err; if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) - goto out_unlock; + return err; } for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - out_unlock: - namespace_unlock(); - return err; + return 0; } /* may_copy_tree() - check if a mount tree can be copied From 7b99ee2c5c83b2df03c50bb2d61bdceb76189352 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 20:19:05 -0400 Subject: [PATCH 07/64] do_set_group(): use guards clean fit; namespace_excl to modify propagation graph Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a6a7b068770a..13e2f3837a26 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3349,47 +3349,44 @@ static inline int tree_contains_unbindable(struct mount *mnt) static int do_set_group(struct path *from_path, struct path *to_path) { - struct mount *from, *to; + struct mount *from = real_mount(from_path->mnt); + struct mount *to = real_mount(to_path->mnt); int err; - from = real_mount(from_path->mnt); - to = real_mount(to_path->mnt); - - namespace_lock(); + guard(namespace_excl)(); err = may_change_propagation(from); if (err) - goto out; + return err; err = may_change_propagation(to); if (err) - goto out; + return err; - err = -EINVAL; /* To and From paths should be mount roots */ if (!path_mounted(from_path)) - goto out; + return -EINVAL; if (!path_mounted(to_path)) - goto out; + return -EINVAL; /* Setting sharing groups is only allowed across same superblock */ if (from->mnt.mnt_sb != to->mnt.mnt_sb) - goto out; + return -EINVAL; /* From mount root should be wider than To mount root */ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) - goto out; + return -EINVAL; /* From mount should not have locked children in place of To's root */ if (__has_locked_children(from, to->mnt.mnt_root)) - goto out; + return -EINVAL; /* Setting sharing groups is only allowed on private mounts */ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) - goto out; + return -EINVAL; /* From should not be private */ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) - goto out; + return -EINVAL; if (IS_MNT_SLAVE(from)) { hlist_add_behind(&to->mnt_slave, &from->mnt_slave); @@ -3401,11 +3398,7 @@ static int do_set_group(struct path *from_path, struct path *to_path) list_add(&to->mnt_share, &from->mnt_share); set_mnt_shared(to); } - - err = 0; -out: - namespace_unlock(); - return err; + return 0; } /** From 550dda45df9e54d6b0cd34456f5987e0c7d7522a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 20:20:44 -0400 Subject: [PATCH 08/64] mark_mounts_for_expiry(): use guards Clean fit; guards can't be weaker due to umount_tree() calls. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 13e2f3837a26..898a6b7307e4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3886,8 +3886,8 @@ void mark_mounts_for_expiry(struct list_head *mounts) if (list_empty(mounts)) return; - namespace_lock(); - lock_mount_hash(); + guard(namespace_excl)(); + guard(mount_writer)(); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -3909,8 +3909,6 @@ void mark_mounts_for_expiry(struct list_head *mounts) touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } - unlock_mount_hash(); - namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); From 61e68af33ac7977ea2457a861ab8eeb2406cb3f1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 20:22:10 -0400 Subject: [PATCH 09/64] put_mnt_ns(): use guards clean fit; guards can't be weaker due to umount_tree() call. Setting emptied_ns requires namespace_excl, but not anything mount_lock-related. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 898a6b7307e4..86a86be2b0ef 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6153,12 +6153,10 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; - namespace_lock(); + guard(namespace_excl)(); emptied_ns = ns; - lock_mount_hash(); + guard(mount_writer)(); umount_tree(ns->root, 0); - unlock_mount_hash(); - namespace_unlock(); } struct vfsmount *kern_mount(struct file_system_type *type) From 747e91e5b7396eba699989072c61e8fb076bbea4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 20:24:23 -0400 Subject: [PATCH 10/64] mnt_already_visible(): use guards clean fit; namespace_shared due to iterating through ns->mounts. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 86a86be2b0ef..a5d37b97088f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6232,9 +6232,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns, { int new_flags = *new_mnt_flags; struct mount *mnt, *n; - bool visible = false; - down_read(&namespace_sem); + guard(namespace_shared)(); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; @@ -6281,13 +6280,10 @@ static bool mnt_already_visible(struct mnt_namespace *ns, /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ MNT_LOCK_ATIME); - visible = true; - goto found; + return true; next: ; } -found: - up_read(&namespace_sem); - return visible; + return false; } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) From 6b448d7a7c4848fd6cbe0eee49834df7d25b25f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 20:28:35 -0400 Subject: [PATCH 11/64] check_for_nsfs_mounts(): no need to take locks Currently we are taking mount_writer; what that function needs is either mount_locked_reader (we are not changing anything, we just want to iterate through the subtree) or namespace_shared and a reference held by caller on the root of subtree - that's also enough to stabilize the topology. The thing is, all callers are already holding at least namespace_shared as well as a reference to the root of subtree. Let's make the callers provide locking warranties - don't mess with mount_lock in check_for_nsfs_mounts() itself and document the locking requirements. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a5d37b97088f..59948cbf9c47 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2402,21 +2402,15 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. + * + * locks: mount_locked_reader || namespace_shared && pinned(subtree) */ static bool check_for_nsfs_mounts(struct mount *subtree) { - struct mount *p; - bool ret = false; - - lock_mount_hash(); - for (p = subtree; p; p = next_mnt(p, subtree)) + for (struct mount *p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) - goto out; - - ret = true; -out: - unlock_mount_hash(); - return ret; + return false; + return true; } /** From 511db073b2150388dd05258ca2d7045dd0784033 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 21:40:07 -0400 Subject: [PATCH 12/64] propagate_mnt(): use scoped_guard(mount_locked_reader) for mnt_set_mountpoint() Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/pnode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/pnode.c b/fs/pnode.c index 6f7d02f3fa98..0702d45d856d 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -304,9 +304,8 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, err = PTR_ERR(this); break; } - read_seqlock_excl(&mount_lock); - mnt_set_mountpoint(n, dest_mp, this); - read_sequnlock_excl(&mount_lock); + scoped_guard(mount_locked_reader) + mnt_set_mountpoint(n, dest_mp, this); if (n->mnt_master) SET_MNT_MARK(n->mnt_master); copy = this; From f80b84358f65167546b6cd58f867ef5669f22f3d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 21:48:35 -0400 Subject: [PATCH 13/64] has_locked_children(): use guards ... and document the locking requirements of __has_locked_children() Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 59948cbf9c47..2cb3cb8307ca 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2373,6 +2373,7 @@ void dissolve_on_fput(struct vfsmount *mnt) } } +/* locks: namespace_shared && pinned(mnt) || mount_locked_reader */ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -2389,12 +2390,8 @@ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) bool has_locked_children(struct mount *mnt, struct dentry *dentry) { - bool res; - - read_seqlock_excl(&mount_lock); - res = __has_locked_children(mnt, dentry); - read_sequnlock_excl(&mount_lock); - return res; + guard(mount_locked_reader)(); + return __has_locked_children(mnt, dentry); } /* From 2605d8684320e6c09e56743e66ecac8ae581b464 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 21:58:03 -0400 Subject: [PATCH 14/64] mnt_set_expiry(): use guards The reason why it needs only mount_locked_reader is that there's no lockless accesses of expiry lists. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 2cb3cb8307ca..db25c81d7f68 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3858,9 +3858,8 @@ int finish_automount(struct vfsmount *m, const struct path *path) */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - read_seqlock_excl(&mount_lock); + guard(mount_locked_reader)(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - read_sequnlock_excl(&mount_lock); } EXPORT_SYMBOL(mnt_set_expiry); From 2aec880c1cdf1cf2dab931b4dd8744c415311bc9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 21 Aug 2025 20:48:38 -0400 Subject: [PATCH 15/64] path_is_under(): use guards ... and document that locking requirements for is_path_reachable(). There is one questionable caller in do_listmount() where we are not holding mount_lock *and* might not have the first argument mounted. However, in that case it will immediately return true without having to look at the ancestors. Might be cleaner to move the check into non-LSTM_ROOT case which it really belongs in - there the check is not always true and is_mounted() is guaranteed. Document the locking environments for is_path_reachable() callers: get_peer_under_root() get_dominating_id() do_statmount() do_listmount() Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 11 +++++------ fs/pnode.c | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index db25c81d7f68..6aabf0045389 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4592,7 +4592,7 @@ SYSCALL_DEFINE5(move_mount, /* * Return true if path is reachable from root * - * namespace_sem or mount_lock is held + * locks: mount_locked_reader || namespace_shared && is_mounted(mnt) */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) @@ -4606,11 +4606,8 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, bool path_is_under(const struct path *path1, const struct path *path2) { - bool res; - read_seqlock_excl(&mount_lock); - res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); - read_sequnlock_excl(&mount_lock); - return res; + guard(mount_locked_reader)(); + return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); } EXPORT_SYMBOL(path_is_under); @@ -5689,6 +5686,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) STATMOUNT_MNT_UIDMAP | \ STATMOUNT_MNT_GIDMAP) +/* locks: namespace_shared */ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { @@ -5949,6 +5947,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, return ret; } +/* locks: namespace_shared */ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, bool reverse) diff --git a/fs/pnode.c b/fs/pnode.c index 0702d45d856d..edaf9d9d0eaf 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -29,6 +29,7 @@ static inline struct mount *next_slave(struct mount *p) return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave); } +/* locks: namespace_shared && is_mounted(mnt) */ static struct mount *get_peer_under_root(struct mount *mnt, struct mnt_namespace *ns, const struct path *root) @@ -50,7 +51,7 @@ static struct mount *get_peer_under_root(struct mount *mnt, * Get ID of closest dominating peer group having a representative * under the given root. * - * Caller must hold namespace_sem + * locks: namespace_shared */ int get_dominating_id(struct mount *mnt, const struct path *root) { From 6b6516c56b04d556ce57deab1b8c2833e4af26c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Jul 2025 01:39:45 -0400 Subject: [PATCH 16/64] current_chrooted(): don't bother with follow_down_one() All we need here is to follow ->overmount on root mount of namespace... Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 6aabf0045389..cf680fbf015e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6194,24 +6194,22 @@ bool our_mnt(struct vfsmount *mnt) bool current_chrooted(void) { /* Does the current process have a non-standard root */ - struct path ns_root; + struct mount *root = current->nsproxy->mnt_ns->root; struct path fs_root; bool chrooted; - /* Find the namespace root */ - ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; - ns_root.dentry = ns_root.mnt->mnt_root; - path_get(&ns_root); - while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) - ; - get_fs_root(current->fs, &fs_root); - chrooted = !path_equal(&fs_root, &ns_root); + /* Find the namespace root */ + read_seqlock_excl(&mount_lock); + while (unlikely(root->overmount)) + root = root->overmount; + + chrooted = fs_root.mnt != &root->mnt || !path_mounted(&fs_root); + + read_sequnlock_excl(&mount_lock); path_put(&fs_root); - path_put(&ns_root); - return chrooted; } From 8281f98a68d3306b2e47e6bb00708525562ccfdf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 21 Aug 2025 21:02:06 -0400 Subject: [PATCH 17/64] current_chrooted(): use guards here a use of __free(path_put) for dropping fs_root is enough to make guard(mount_locked_reader) fit... Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index cf680fbf015e..0474b3a93dbf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6194,23 +6194,20 @@ bool our_mnt(struct vfsmount *mnt) bool current_chrooted(void) { /* Does the current process have a non-standard root */ - struct mount *root = current->nsproxy->mnt_ns->root; - struct path fs_root; - bool chrooted; + struct path fs_root __free(path_put) = {}; + struct mount *root; get_fs_root(current->fs, &fs_root); /* Find the namespace root */ - read_seqlock_excl(&mount_lock); + guard(mount_locked_reader)(); + + root = current->nsproxy->mnt_ns->root; while (unlikely(root->overmount)) root = root->overmount; - chrooted = fs_root.mnt != &root->mnt || !path_mounted(&fs_root); - - read_sequnlock_excl(&mount_lock); - path_put(&fs_root); - return chrooted; + return fs_root.mnt != &root->mnt || !path_mounted(&fs_root); } static bool mnt_already_visible(struct mnt_namespace *ns, From 5423426a79dd4d4422750b063febed40f362afcc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Aug 2025 13:57:42 -0400 Subject: [PATCH 18/64] switch do_new_mount_fc() to fc_mount() Prior to the call of do_new_mount_fc() the caller has just done successful vfs_get_tree(). Then do_new_mount_fc() does several checks on resulting superblock, and either does fc_drop_locked() and returns an error or proceeds to unlock the superblock and call vfs_create_mount(). The thing is, there's no reason to delay that unlock + vfs_create_mount() - the tests do not rely upon the state of ->s_umount and fc_drop_locked() put_fs_context() is equivalent to unlock ->s_umount put_fs_context() Doing vfs_create_mount() before the checks allows us to move vfs_get_tree() from caller to do_new_mount_fc() and collapse it with vfs_create_mount() into an fc_mount() call. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 0474b3a93dbf..9b575c9eee0b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3705,25 +3705,20 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { - struct vfsmount *mnt; struct pinned_mountpoint mp = {}; - struct super_block *sb = fc->root->d_sb; + struct super_block *sb; + struct vfsmount *mnt = fc_mount(fc); int error; + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + sb = fc->root->d_sb; error = security_sb_kern_mount(sb); if (!error && mount_too_revealing(sb, &mnt_flags)) error = -EPERM; - - if (unlikely(error)) { - fc_drop_locked(fc); - return error; - } - - up_write(&sb->s_umount); - - mnt = vfs_create_mount(fc); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); + if (unlikely(error)) + goto out; mnt_warn_timestamp_expiry(mountpoint, mnt); @@ -3731,10 +3726,12 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, if (!error) { error = do_add_mount(real_mount(mnt), mp.mp, mountpoint, mnt_flags); + if (!error) + mnt = NULL; // consumed on success unlock_mount(&mp); } - if (error < 0) - mntput(mnt); +out: + mntput(mnt); return error; } @@ -3788,8 +3785,6 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) err = -EPERM; - if (!err) - err = vfs_get_tree(fc); if (!err) err = do_new_mount_fc(fc, path, mnt_flags); From a666bbcf7e9c5e0524b60677a0a43a4cddedcefd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Aug 2025 21:34:03 -0400 Subject: [PATCH 19/64] do_move_mount(): trim local variables Both 'parent' and 'ns' are used at most once, no point precalculating those... Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 9b575c9eee0b..ad9b5687ff15 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3564,10 +3564,8 @@ static inline bool may_use_mount(struct mount *mnt) static int do_move_mount(struct path *old_path, struct path *new_path, enum mnt_tree_flags_t flags) { - struct mnt_namespace *ns; struct mount *p; struct mount *old; - struct mount *parent; struct pinned_mountpoint mp; int err; bool beneath = flags & MNT_TREE_BENEATH; @@ -3578,8 +3576,6 @@ static int do_move_mount(struct path *old_path, old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); - parent = old->mnt_parent; - ns = old->mnt_ns; err = -EINVAL; @@ -3588,12 +3584,12 @@ static int do_move_mount(struct path *old_path, /* ... it should be detachable from parent */ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) goto out; + /* ... which should not be shared */ + if (IS_MNT_SHARED(old->mnt_parent)) + goto out; /* ... and the target should be in our namespace */ if (!check_mnt(p)) goto out; - /* parent of the source should not be shared */ - if (IS_MNT_SHARED(parent)) - goto out; } else { /* * otherwise the source must be the root of some anon namespace. @@ -3605,7 +3601,7 @@ static int do_move_mount(struct path *old_path, * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ - if (ns == p->mnt_ns) + if (old->mnt_ns == p->mnt_ns) goto out; /* * Target should be either in our namespace or in an acceptable From c1ab70be88f3135cdb931673aae2afc5d6624b62 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 14:50:09 -0400 Subject: [PATCH 20/64] do_move_mount(): deal with the checks on old_path early 1) checking that location we want to move does point to root of some mount can be done before anything else; that property is not going to change and having it already verified simplifies the analysis. 2) checking the type agreement between what we are trying to move and what we are trying to move it onto also belongs in the very beginning - do_lock_mount() might end up switching new_path to something that overmounts the original location, but... the same type agreement applies to overmounts, so we could just as well check against the original location. 3) since we know that old_path->dentry is the root of old_path->mnt, there's no point bothering with path_is_overmounted() in can_move_mount_beneath(); it's simply a check for the mount we are trying to move having non-NULL ->overmount. And with that, we can switch can_move_mount_beneath() to taking old instead of old_path, leaving no uses of old_path past the original checks. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ad9b5687ff15..74c67ea1b5a8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3433,7 +3433,7 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) /** * can_move_mount_beneath - check that we can mount beneath the top mount - * @from: mount to mount beneath + * @mnt_from: mount we are trying to move * @to: mount under which to mount * @mp: mountpoint of @to * @@ -3443,7 +3443,7 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. - * - Ensure that nothing has been mounted on top of @from before we + * - Ensure that nothing has been mounted on top of @mnt_from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. * - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to @@ -3452,12 +3452,11 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. */ -static int can_move_mount_beneath(const struct path *from, +static int can_move_mount_beneath(struct mount *mnt_from, const struct path *to, const struct mountpoint *mp) { - struct mount *mnt_from = real_mount(from->mnt), - *mnt_to = real_mount(to->mnt), + struct mount *mnt_to = real_mount(to->mnt), *parent_mnt_to = mnt_to->mnt_parent; if (!mnt_has_parent(mnt_to)) @@ -3470,7 +3469,7 @@ static int can_move_mount_beneath(const struct path *from, return -EINVAL; /* Avoid creating shadow mounts during mount propagation. */ - if (path_overmounted(from)) + if (mnt_from->overmount) return -EINVAL; /* @@ -3565,16 +3564,21 @@ static int do_move_mount(struct path *old_path, struct path *new_path, enum mnt_tree_flags_t flags) { struct mount *p; - struct mount *old; + struct mount *old = real_mount(old_path->mnt); struct pinned_mountpoint mp; int err; bool beneath = flags & MNT_TREE_BENEATH; + if (!path_mounted(old_path)) + return -EINVAL; + + if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) + return -EINVAL; + err = do_lock_mount(new_path, &mp, beneath); if (err) return err; - old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); err = -EINVAL; @@ -3611,15 +3615,8 @@ static int do_move_mount(struct path *old_path, goto out; } - if (!path_mounted(old_path)) - goto out; - - if (d_is_dir(new_path->dentry) != - d_is_dir(old_path->dentry)) - goto out; - if (beneath) { - err = can_move_mount_beneath(old_path, new_path, mp.mp); + err = can_move_mount_beneath(old, new_path, mp.mp); if (err) goto out; From d29da1a8f119130e6fc7d5d71029d402dabe2cb0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Aug 2025 23:54:39 -0400 Subject: [PATCH 21/64] move_mount(2): take sanity checks in 'beneath' case into do_lock_mount() We want to mount beneath the given location. For that operation to make sense, location must be the root of some mount that has something under it. Currently we let it proceed if those requirements are not met, with rather meaningless results, and have that bogosity caught further down the road; let's fail early instead - do_lock_mount() doesn't make sense unless those conditions hold, and checking them there makes things simpler. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 74c67ea1b5a8..86c6dd432b13 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2768,12 +2768,19 @@ static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bo struct path under = {}; int err = -ENOENT; + if (unlikely(beneath) && !path_mounted(path)) + return -EINVAL; + for (;;) { struct mount *m = real_mount(mnt); if (beneath) { path_put(&under); read_seqlock_excl(&mount_lock); + if (unlikely(!mnt_has_parent(m))) { + read_sequnlock_excl(&mount_lock); + return -EINVAL; + } under.mnt = mntget(&m->mnt_parent->mnt); under.dentry = dget(m->mnt_mountpoint); read_sequnlock_excl(&mount_lock); @@ -3437,8 +3444,6 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) * @to: mount under which to mount * @mp: mountpoint of @to * - * - Make sure that @to->dentry is actually the root of a mount under - * which we can mount another mount. * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring @@ -3459,12 +3464,6 @@ static int can_move_mount_beneath(struct mount *mnt_from, struct mount *mnt_to = real_mount(to->mnt), *parent_mnt_to = mnt_to->mnt_parent; - if (!mnt_has_parent(mnt_to)) - return -EINVAL; - - if (!path_mounted(to)) - return -EINVAL; - if (IS_MNT_LOCKED(mnt_to)) return -EINVAL; From 11941610b06820c4af7f1ff12071f159b3bf771d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Aug 2025 13:07:43 -0400 Subject: [PATCH 22/64] finish_automount(): simplify the ELOOP check It's enough to check that dentries match; if path->dentry is equal to m->mnt_root, superblocks will match as well. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 86c6dd432b13..bdb33270ac6e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3798,8 +3798,7 @@ int finish_automount(struct vfsmount *m, const struct path *path) mnt = real_mount(m); - if (m->mnt_sb == path->mnt->mnt_sb && - m->mnt_root == dentry) { + if (m->mnt_root == path->dentry) { err = -ELOOP; goto discard; } From 76dfde13d68a2a6e6c2409407558ee908491df36 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 21 Aug 2025 22:28:51 -0400 Subject: [PATCH 23/64] do_loopback(): use __free(path_put) to deal with old_path preparations for making unlock_mount() a __cleanup(); can't have path_put() inside mount_lock scope. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index bdb33270ac6e..245cf2d19a6b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3014,7 +3014,7 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) static int do_loopback(struct path *path, const char *old_name, int recurse) { - struct path old_path; + struct path old_path __free(path_put) = {}; struct mount *mnt = NULL, *parent; struct pinned_mountpoint mp = {}; int err; @@ -3024,13 +3024,12 @@ static int do_loopback(struct path *path, const char *old_name, if (err) return err; - err = -EINVAL; if (mnt_ns_loop(old_path.dentry)) - goto out; + return -EINVAL; err = lock_mount(path, &mp); if (err) - goto out; + return err; parent = real_mount(path->mnt); if (!check_mnt(parent)) @@ -3050,8 +3049,6 @@ static int do_loopback(struct path *path, const char *old_name, } out2: unlock_mount(&mp); -out: - path_put(&old_path); return err; } From 6bbbc4a04a10ebdb633501dc87aac5b0d6b80ec8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 21 Aug 2025 22:40:14 -0400 Subject: [PATCH 24/64] pivot_root(2): use __free() to deal with struct path in it preparations for making unlock_mount() a __cleanup(); can't have path_put() inside mount_lock scope. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 245cf2d19a6b..90b62ee882da 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4622,7 +4622,9 @@ EXPORT_SYMBOL(path_is_under); SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { - struct path new, old, root; + struct path new __free(path_put) = {}; + struct path old __free(path_put) = {}; + struct path root __free(path_put) = {}; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; struct pinned_mountpoint old_mp = {}; int error; @@ -4633,21 +4635,21 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = user_path_at(AT_FDCWD, new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) - goto out0; + return error; error = user_path_at(AT_FDCWD, put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) - goto out1; + return error; error = security_sb_pivotroot(&old, &new); if (error) - goto out2; + return error; get_fs_root(current->fs, &root); error = lock_mount(&old, &old_mp); if (error) - goto out3; + return error; error = -EINVAL; new_mnt = real_mount(new.mnt); @@ -4705,13 +4707,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = 0; out4: unlock_mount(&old_mp); -out3: - path_put(&root); -out2: - path_put(&old); -out1: - path_put(&new); -out0: return error; } From 9bf5d488529b9efa01bd292633482acd10277b90 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Aug 2025 12:59:10 -0400 Subject: [PATCH 25/64] finish_automount(): take the lock_mount() analogue into a helper finish_automount() can't use lock_mount() - it treats finding something already mounted as "quitely drop our mount and return 0", not as "mount on top of whatever mounted there". It's been open-coded; let's take it into a helper similar to lock_mount(). "something's already mounted" => -EBUSY, finish_automount() needs to distinguish it from the normal case and it can't happen in other failure cases. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 90b62ee882da..6251ee15f5f6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3781,9 +3781,29 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, return err; } -int finish_automount(struct vfsmount *m, const struct path *path) +static int lock_mount_exact(const struct path *path, + struct pinned_mountpoint *mp) { struct dentry *dentry = path->dentry; + int err; + + inode_lock(dentry->d_inode); + namespace_lock(); + if (unlikely(cant_mount(dentry))) + err = -ENOENT; + else if (path_overmounted(path)) + err = -EBUSY; + else + err = get_mountpoint(dentry, mp); + if (unlikely(err)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + } + return err; +} + +int finish_automount(struct vfsmount *m, const struct path *path) +{ struct pinned_mountpoint mp = {}; struct mount *mnt; int err; @@ -3805,20 +3825,11 @@ int finish_automount(struct vfsmount *m, const struct path *path) * that overmounts our mountpoint to be means "quitely drop what we've * got", not "try to mount it on top". */ - inode_lock(dentry->d_inode); - namespace_lock(); - if (unlikely(cant_mount(dentry))) { - err = -ENOENT; - goto discard_locked; + err = lock_mount_exact(path, &mp); + if (unlikely(err)) { + mntput(m); + return err == -EBUSY ? 0 : err; } - if (path_overmounted(path)) { - err = 0; - goto discard_locked; - } - err = get_mountpoint(dentry, &mp); - if (err) - goto discard_locked; - err = do_add_mount(mnt, mp.mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); unlock_mount(&mp); @@ -3826,9 +3837,6 @@ int finish_automount(struct vfsmount *m, const struct path *path) goto discard; return 0; -discard_locked: - namespace_unlock(); - inode_unlock(dentry->d_inode); discard: mntput(m); return err; From 308a022f41bd3f12ceeda34f6d36bf8f2601d9ae Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 21 Aug 2025 22:43:32 -0400 Subject: [PATCH 26/64] do_new_mount_fc(): use __free() to deal with dropping mnt on failure do_add_mount() consumes vfsmount on success; just follow it with conditional retain_and_null_ptr() on success and we can switch to __free() for mnt and be done with that - unlock_mount() is in the very end. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 6251ee15f5f6..3551e51461a2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3696,7 +3696,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, { struct pinned_mountpoint mp = {}; struct super_block *sb; - struct vfsmount *mnt = fc_mount(fc); + struct vfsmount *mnt __free(mntput) = fc_mount(fc); int error; if (IS_ERR(mnt)) @@ -3704,10 +3704,11 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, sb = fc->root->d_sb; error = security_sb_kern_mount(sb); - if (!error && mount_too_revealing(sb, &mnt_flags)) - error = -EPERM; if (unlikely(error)) - goto out; + return error; + + if (unlikely(mount_too_revealing(sb, &mnt_flags))) + return -EPERM; mnt_warn_timestamp_expiry(mountpoint, mnt); @@ -3716,11 +3717,9 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, error = do_add_mount(real_mount(mnt), mp.mp, mountpoint, mnt_flags); if (!error) - mnt = NULL; // consumed on success + retain_and_null_ptr(mnt); // consumed on success unlock_mount(&mp); } -out: - mntput(mnt); return error; } From f1f486b841c73e94167956e1885f865cac4e629f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 21 Aug 2025 22:53:06 -0400 Subject: [PATCH 27/64] finish_automount(): use __free() to deal with dropping mnt on failure same story as with do_new_mount_fc(). Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 3551e51461a2..779cfed04291 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3801,8 +3801,9 @@ static int lock_mount_exact(const struct path *path, return err; } -int finish_automount(struct vfsmount *m, const struct path *path) +int finish_automount(struct vfsmount *__m, const struct path *path) { + struct vfsmount *m __free(mntput) = __m; struct pinned_mountpoint mp = {}; struct mount *mnt; int err; @@ -3814,10 +3815,8 @@ int finish_automount(struct vfsmount *m, const struct path *path) mnt = real_mount(m); - if (m->mnt_root == path->dentry) { - err = -ELOOP; - goto discard; - } + if (m->mnt_root == path->dentry) + return -ELOOP; /* * we don't want to use lock_mount() - in this case finding something @@ -3825,19 +3824,14 @@ int finish_automount(struct vfsmount *m, const struct path *path) * got", not "try to mount it on top". */ err = lock_mount_exact(path, &mp); - if (unlikely(err)) { - mntput(m); + if (unlikely(err)) return err == -EBUSY ? 0 : err; - } + err = do_add_mount(mnt, mp.mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (likely(!err)) + retain_and_null_ptr(m); unlock_mount(&mp); - if (unlikely(err)) - goto discard; - return 0; - -discard: - mntput(m); return err; } From 2010464cfafb22821e39c216cb068d83c91d7d8b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 13:42:24 -0400 Subject: [PATCH 28/64] change calling conventions for lock_mount() et.al. 1) pinned_mountpoint gets a new member - struct mount *parent. Set only if we locked the sucker; ERR_PTR() - on failed attempt. 2) do_lock_mount() et.al. return void and set ->parent to * on success with !beneath - mount corresponding to path->mnt * on success with beneath - the parent of mount corresponding to path->mnt * in case of error - ERR_PTR(-E...). IOW, we get the mount we will be actually mounting upon or ERR_PTR(). 3) we can't use CLASS, since the pinned_mountpoint is placed on hlist during initialization, so we define local macros: LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) LOCK_MOUNT_EXACT(mp, path) All of them declare and initialize struct pinned_mountpoint mp, with unlock_mount done via __cleanup(). Users converted. [ lock_mount() is unused now; removed. Reported-by: Andy Shevchenko ] Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 222 ++++++++++++++++++++++++------------------------- 1 file changed, 107 insertions(+), 115 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 779cfed04291..a6f48750536a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -919,6 +919,7 @@ bool __is_local_mountpoint(const struct dentry *dentry) struct pinned_mountpoint { struct hlist_node node; struct mountpoint *mp; + struct mount *parent; }; static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) @@ -2728,48 +2729,47 @@ static int attach_recursive_mnt(struct mount *source_mnt, } /** - * do_lock_mount - lock mount and mountpoint - * @path: target path - * @beneath: whether the intention is to mount beneath @path + * do_lock_mount - acquire environment for mounting + * @path: target path + * @res: context to set up + * @beneath: whether the intention is to mount beneath @path * - * Follow the mount stack on @path until the top mount @mnt is found. If - * the initial @path->{mnt,dentry} is a mountpoint lookup the first - * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root} - * until nothing is stacked on top of it anymore. + * To mount something at given location, we need + * namespace_sem locked exclusive + * inode of dentry we are mounting on locked exclusive + * struct mountpoint for that dentry + * struct mount we are mounting on * - * Acquire the inode_lock() on the top mount's ->mnt_root to protect - * against concurrent removal of the new mountpoint from another mount - * namespace. + * Results are stored in caller-supplied context (pinned_mountpoint); + * on success we have res->parent and res->mp pointing to parent and + * mountpoint respectively and res->node inserted into the ->m_list + * of the mountpoint, making sure the mountpoint won't disappear. + * On failure we have res->parent set to ERR_PTR(-E...), res->mp + * left NULL, res->node - empty. + * In case of success do_lock_mount returns with locks acquired (in + * proper order - inode lock nests outside of namespace_sem). * - * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint - * @mp on @mnt->mnt_parent must be acquired. This protects against a - * concurrent unlink of @mp->mnt_dentry from another mount namespace - * where @mnt doesn't have a child mount mounted @mp. A concurrent - * removal of @mnt->mnt_root doesn't matter as nothing will be mounted - * on top of it for @beneath. + * Request to mount on overmounted location is treated as "mount on + * top of whatever's overmounting it"; request to mount beneath + * a location - "mount immediately beneath the topmost mount at that + * place". * - * In addition, @beneath needs to make sure that @mnt hasn't been - * unmounted or moved from its current mountpoint in between dropping - * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt - * being unmounted would be detected later by e.g., calling - * check_mnt(mnt) in the function it's called from. For the @beneath - * case however, it's useful to detect it directly in do_lock_mount(). - * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points - * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will - * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. - * - * Return: Either the target mountpoint on the top mount or the top - * mount's mountpoint. + * In all cases the location must not have been unmounted and the + * chosen mountpoint must be allowed to be mounted on. For "beneath" + * case we also require the location to be at the root of a mount + * that has a parent (i.e. is not a root of some namespace). */ -static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath) +static void do_lock_mount(struct path *path, struct pinned_mountpoint *res, bool beneath) { struct vfsmount *mnt = path->mnt; struct dentry *dentry; struct path under = {}; int err = -ENOENT; - if (unlikely(beneath) && !path_mounted(path)) - return -EINVAL; + if (unlikely(beneath) && !path_mounted(path)) { + res->parent = ERR_PTR(-EINVAL); + return; + } for (;;) { struct mount *m = real_mount(mnt); @@ -2779,7 +2779,8 @@ static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bo read_seqlock_excl(&mount_lock); if (unlikely(!mnt_has_parent(m))) { read_sequnlock_excl(&mount_lock); - return -EINVAL; + res->parent = ERR_PTR(-EINVAL); + return; } under.mnt = mntget(&m->mnt_parent->mnt); under.dentry = dget(m->mnt_mountpoint); @@ -2811,7 +2812,7 @@ static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bo path->dentry = dget(mnt->mnt_root); continue; // got overmounted } - err = get_mountpoint(dentry, pinned); + err = get_mountpoint(dentry, res); if (err) break; if (beneath) { @@ -2822,22 +2823,20 @@ static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bo * we are not dropping the final references here). */ path_put(&under); + res->parent = real_mount(path->mnt)->mnt_parent; + return; } - return 0; + res->parent = real_mount(path->mnt); + return; } namespace_unlock(); inode_unlock(dentry->d_inode); if (beneath) path_put(&under); - return err; + res->parent = ERR_PTR(err); } -static inline int lock_mount(struct path *path, struct pinned_mountpoint *m) -{ - return do_lock_mount(path, m, false); -} - -static void unlock_mount(struct pinned_mountpoint *m) +static void __unlock_mount(struct pinned_mountpoint *m) { inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); @@ -2846,6 +2845,20 @@ static void unlock_mount(struct pinned_mountpoint *m) namespace_unlock(); } +static inline void unlock_mount(struct pinned_mountpoint *m) +{ + if (!IS_ERR(m->parent)) + __unlock_mount(m); +} + +#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + do_lock_mount((path), &mp, (beneath)) +#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) +#define LOCK_MOUNT_EXACT(mp, path) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + lock_mount_exact((path), &mp) + static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) @@ -3015,8 +3028,7 @@ static int do_loopback(struct path *path, const char *old_name, int recurse) { struct path old_path __free(path_put) = {}; - struct mount *mnt = NULL, *parent; - struct pinned_mountpoint mp = {}; + struct mount *mnt = NULL; int err; if (!old_name || !*old_name) return -EINVAL; @@ -3027,28 +3039,23 @@ static int do_loopback(struct path *path, const char *old_name, if (mnt_ns_loop(old_path.dentry)) return -EINVAL; - err = lock_mount(path, &mp); - if (err) - return err; + LOCK_MOUNT(mp, path); + if (IS_ERR(mp.parent)) + return PTR_ERR(mp.parent); - parent = real_mount(path->mnt); - if (!check_mnt(parent)) - goto out2; + if (!check_mnt(mp.parent)) + return -EINVAL; mnt = __do_loopback(&old_path, recurse); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - goto out2; - } + if (IS_ERR(mnt)) + return PTR_ERR(mnt); - err = graft_tree(mnt, parent, mp.mp); + err = graft_tree(mnt, mp.parent, mp.mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } -out2: - unlock_mount(&mp); return err; } @@ -3561,7 +3568,6 @@ static int do_move_mount(struct path *old_path, { struct mount *p; struct mount *old = real_mount(old_path->mnt); - struct pinned_mountpoint mp; int err; bool beneath = flags & MNT_TREE_BENEATH; @@ -3571,52 +3577,49 @@ static int do_move_mount(struct path *old_path, if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) return -EINVAL; - err = do_lock_mount(new_path, &mp, beneath); - if (err) - return err; + LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath); + if (IS_ERR(mp.parent)) + return PTR_ERR(mp.parent); p = real_mount(new_path->mnt); - err = -EINVAL; - if (check_mnt(old)) { /* if the source is in our namespace... */ /* ... it should be detachable from parent */ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) - goto out; + return -EINVAL; /* ... which should not be shared */ if (IS_MNT_SHARED(old->mnt_parent)) - goto out; + return -EINVAL; /* ... and the target should be in our namespace */ if (!check_mnt(p)) - goto out; + return -EINVAL; } else { /* * otherwise the source must be the root of some anon namespace. */ if (!anon_ns_root(old)) - goto out; + return -EINVAL; /* * Bail out early if the target is within the same namespace - * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ if (old->mnt_ns == p->mnt_ns) - goto out; + return -EINVAL; /* * Target should be either in our namespace or in an acceptable * anon namespace, sensu check_anonymous_mnt(). */ if (!may_use_mount(p)) - goto out; + return -EINVAL; } if (beneath) { err = can_move_mount_beneath(old, new_path, mp.mp); if (err) - goto out; + return err; - err = -EINVAL; p = p->mnt_parent; } @@ -3625,17 +3628,13 @@ static int do_move_mount(struct path *old_path, * mount which is shared. */ if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) - goto out; - err = -ELOOP; + return -EINVAL; if (!check_for_nsfs_mounts(old)) - goto out; + return -ELOOP; if (mount_is_ancestor(old, p)) - goto out; + return -ELOOP; - err = attach_recursive_mnt(old, p, mp.mp); -out: - unlock_mount(&mp); - return err; + return attach_recursive_mnt(old, p, mp.mp); } static int do_move_mount_old(struct path *path, const char *old_name) @@ -3694,7 +3693,6 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { - struct pinned_mountpoint mp = {}; struct super_block *sb; struct vfsmount *mnt __free(mntput) = fc_mount(fc); int error; @@ -3712,13 +3710,14 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); - error = lock_mount(mountpoint, &mp); - if (!error) { + LOCK_MOUNT(mp, mountpoint); + if (IS_ERR(mp.parent)) { + return PTR_ERR(mp.parent); + } else { error = do_add_mount(real_mount(mnt), mp.mp, mountpoint, mnt_flags); if (!error) retain_and_null_ptr(mnt); // consumed on success - unlock_mount(&mp); } return error; } @@ -3780,8 +3779,8 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, return err; } -static int lock_mount_exact(const struct path *path, - struct pinned_mountpoint *mp) +static void lock_mount_exact(const struct path *path, + struct pinned_mountpoint *mp) { struct dentry *dentry = path->dentry; int err; @@ -3797,14 +3796,15 @@ static int lock_mount_exact(const struct path *path, if (unlikely(err)) { namespace_unlock(); inode_unlock(dentry->d_inode); + mp->parent = ERR_PTR(err); + } else { + mp->parent = real_mount(path->mnt); } - return err; } int finish_automount(struct vfsmount *__m, const struct path *path) { struct vfsmount *m __free(mntput) = __m; - struct pinned_mountpoint mp = {}; struct mount *mnt; int err; @@ -3819,19 +3819,18 @@ int finish_automount(struct vfsmount *__m, const struct path *path) return -ELOOP; /* - * we don't want to use lock_mount() - in this case finding something + * we don't want to use LOCK_MOUNT() - in this case finding something * that overmounts our mountpoint to be means "quitely drop what we've * got", not "try to mount it on top". */ - err = lock_mount_exact(path, &mp); - if (unlikely(err)) - return err == -EBUSY ? 0 : err; + LOCK_MOUNT_EXACT(mp, path); + if (IS_ERR(mp.parent)) + return mp.parent == ERR_PTR(-EBUSY) ? 0 : PTR_ERR(mp.parent); err = do_add_mount(mnt, mp.mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); if (likely(!err)) retain_and_null_ptr(m); - unlock_mount(&mp); return err; } @@ -4627,7 +4626,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, struct path old __free(path_put) = {}; struct path root __free(path_put) = {}; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; - struct pinned_mountpoint old_mp = {}; int error; if (!may_mount()) @@ -4648,45 +4646,42 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, return error; get_fs_root(current->fs, &root); - error = lock_mount(&old, &old_mp); - if (error) - return error; - error = -EINVAL; + LOCK_MOUNT(old_mp, &old); + old_mnt = old_mp.parent; + if (IS_ERR(old_mnt)) + return PTR_ERR(old_mnt); + new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); - old_mnt = real_mount(old.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(ex_parent) || IS_MNT_SHARED(root_parent)) - goto out4; + return -EINVAL; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) - goto out4; + return -EINVAL; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) - goto out4; - error = -ENOENT; + return -EINVAL; if (d_unlinked(new.dentry)) - goto out4; - error = -EBUSY; + return -ENOENT; if (new_mnt == root_mnt || old_mnt == root_mnt) - goto out4; /* loop, on the same file system */ - error = -EINVAL; + return -EBUSY; /* loop, on the same file system */ if (!path_mounted(&root)) - goto out4; /* not a mountpoint */ + return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) - goto out4; /* absolute root */ + return -EINVAL; /* absolute root */ if (!path_mounted(&new)) - goto out4; /* not a mountpoint */ + return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) - goto out4; /* absolute root */ + return -EINVAL; /* absolute root */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) - goto out4; + return -EINVAL; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new.dentry, &root)) - goto out4; + return -EINVAL; lock_mount_hash(); umount_mnt(new_mnt); if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { @@ -4705,10 +4700,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); - error = 0; -out4: - unlock_mount(&old_mp); - return error; + return 0; } static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) From 842e12352c3074b8af1bc95cc2bb1e9fb47f4334 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 13:47:43 -0400 Subject: [PATCH 29/64] do_move_mount(): use the parent mount returned by do_lock_mount() After successful do_lock_mount() call, mp.parent is set to either real_mount(path->mnt) (for !beneath case) or to ->mnt_parent of that (for beneath). p is set to real_mount(path->mnt) and after several uses it's made equal to mp.parent. All uses prior to that care only about p->mnt_ns and since p->mnt_ns == parent->mnt_ns, we might as well use mp.parent all along. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a6f48750536a..bae4be8c7192 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3566,7 +3566,6 @@ static inline bool may_use_mount(struct mount *mnt) static int do_move_mount(struct path *old_path, struct path *new_path, enum mnt_tree_flags_t flags) { - struct mount *p; struct mount *old = real_mount(old_path->mnt); int err; bool beneath = flags & MNT_TREE_BENEATH; @@ -3581,8 +3580,6 @@ static int do_move_mount(struct path *old_path, if (IS_ERR(mp.parent)) return PTR_ERR(mp.parent); - p = real_mount(new_path->mnt); - if (check_mnt(old)) { /* if the source is in our namespace... */ /* ... it should be detachable from parent */ @@ -3592,7 +3589,7 @@ static int do_move_mount(struct path *old_path, if (IS_MNT_SHARED(old->mnt_parent)) return -EINVAL; /* ... and the target should be in our namespace */ - if (!check_mnt(p)) + if (!check_mnt(mp.parent)) return -EINVAL; } else { /* @@ -3605,13 +3602,13 @@ static int do_move_mount(struct path *old_path, * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ - if (old->mnt_ns == p->mnt_ns) + if (old->mnt_ns == mp.parent->mnt_ns) return -EINVAL; /* * Target should be either in our namespace or in an acceptable * anon namespace, sensu check_anonymous_mnt(). */ - if (!may_use_mount(p)) + if (!may_use_mount(mp.parent)) return -EINVAL; } @@ -3619,22 +3616,20 @@ static int do_move_mount(struct path *old_path, err = can_move_mount_beneath(old, new_path, mp.mp); if (err) return err; - - p = p->mnt_parent; } /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ - if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) + if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old)) return -EINVAL; if (!check_for_nsfs_mounts(old)) return -ELOOP; - if (mount_is_ancestor(old, p)) + if (mount_is_ancestor(old, mp.parent)) return -ELOOP; - return attach_recursive_mnt(old, p, mp.mp); + return attach_recursive_mnt(old, mp.parent, mp.mp); } static int do_move_mount_old(struct path *path, const char *old_name) From ef307f89bfb662109a25acdde5b5837fe84a3a64 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Aug 2025 21:21:24 -0400 Subject: [PATCH 30/64] do_add_mount(): switch to passing pinned_mountpoint instead of mountpoint + path Both callers pass it a mountpoint reference picked from pinned_mountpoint and path it corresponds to. First of all, path->dentry is equal to mp.mp->m_dentry. Furthermore, path->mnt is &mp.parent->mnt, making struct path contents redundant. Pass it the address of that pinned_mountpoint instead; what's more, if we teach it to treat ERR_PTR(error) in ->parent as "bail out with that error" we can simplify the callers even more - do_add_mount() will do the right thing even when called after lock_mount() failure. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index bae4be8c7192..059ad02d214a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3652,10 +3652,13 @@ static int do_move_mount_old(struct path *path, const char *old_name) /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, - const struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp, + int mnt_flags) { - struct mount *parent = real_mount(path->mnt); + struct mount *parent = mp->parent; + + if (IS_ERR(parent)) + return PTR_ERR(parent); mnt_flags &= ~MNT_INTERNAL_FLAGS; @@ -3669,14 +3672,15 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, } /* Refuse the same filesystem on the same mount point */ - if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path)) + if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb && + parent->mnt.mnt_root == mp->mp->m_dentry) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; - return graft_tree(newmnt, parent, mp); + return graft_tree(newmnt, parent, mp->mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); @@ -3706,14 +3710,9 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); LOCK_MOUNT(mp, mountpoint); - if (IS_ERR(mp.parent)) { - return PTR_ERR(mp.parent); - } else { - error = do_add_mount(real_mount(mnt), mp.mp, - mountpoint, mnt_flags); - if (!error) - retain_and_null_ptr(mnt); // consumed on success - } + error = do_add_mount(real_mount(mnt), &mp, mnt_flags); + if (!error) + retain_and_null_ptr(mnt); // consumed on success return error; } @@ -3819,11 +3818,10 @@ int finish_automount(struct vfsmount *__m, const struct path *path) * got", not "try to mount it on top". */ LOCK_MOUNT_EXACT(mp, path); - if (IS_ERR(mp.parent)) - return mp.parent == ERR_PTR(-EBUSY) ? 0 : PTR_ERR(mp.parent); + if (mp.parent == ERR_PTR(-EBUSY)) + return 0; - err = do_add_mount(mnt, mp.mp, path, - path->mnt->mnt_flags | MNT_SHRINKABLE); + err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE); if (likely(!err)) retain_and_null_ptr(m); return err; From 6bfb6938e2ffd2ae0b9962aa7cc94c0808eeb100 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Aug 2025 19:03:29 -0400 Subject: [PATCH 31/64] graft_tree(), attach_recursive_mnt() - pass pinned_mountpoint parent and mountpoint always come from the same struct pinned_mountpoint now. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 059ad02d214a..a931bc9598bd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2549,8 +2549,7 @@ enum mnt_tree_flags_t { /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached - * @dest_mnt: mount that @source_mnt will be mounted on - * @dest_mp: the mountpoint @source_mnt will be mounted at + * @dest: the context for mounting at the place where the tree should go * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2613,10 +2612,11 @@ enum mnt_tree_flags_t { * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *dest_mnt, - struct mountpoint *dest_mp) + const struct pinned_mountpoint *dest) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mount *dest_mnt = dest->parent; + struct mountpoint *dest_mp = dest->mp; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct pinned_mountpoint root = {}; @@ -2859,16 +2859,16 @@ static inline void unlock_mount(struct pinned_mountpoint *m) struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ lock_mount_exact((path), &mp) -static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) +static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; - if (d_is_dir(mp->m_dentry) != + if (d_is_dir(mp->mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp); + return attach_recursive_mnt(mnt, mp); } static int may_change_propagation(const struct mount *m) @@ -3050,7 +3050,7 @@ static int do_loopback(struct path *path, const char *old_name, if (IS_ERR(mnt)) return PTR_ERR(mnt); - err = graft_tree(mnt, mp.parent, mp.mp); + err = graft_tree(mnt, &mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); @@ -3629,7 +3629,7 @@ static int do_move_mount(struct path *old_path, if (mount_is_ancestor(old, mp.parent)) return -ELOOP; - return attach_recursive_mnt(old, mp.parent, mp.mp); + return attach_recursive_mnt(old, &mp); } static int do_move_mount_old(struct path *path, const char *old_name) @@ -3680,7 +3680,7 @@ static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; - return graft_tree(newmnt, parent, mp->mp); + return graft_tree(newmnt, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); From a2bdb7d8dcf20810349e68824f399f5092269cf7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 00:26:44 -0400 Subject: [PATCH 32/64] pivot_root(2): use old_mp.mp->m_dentry instead of old.dentry That kills the last place where callers of lock_mount(path, &mp) used path->dentry. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index a931bc9598bd..8c5f5b1edced 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4670,7 +4670,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!mnt_has_parent(new_mnt)) return -EINVAL; /* absolute root */ /* make sure we can reach put_old from new_root */ - if (!is_path_reachable(old_mnt, old.dentry, &new)) + if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new)) return -EINVAL; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new.dentry, &root)) From ed8ba4aad78887d88231c1c66c0ddf9fe12aaad1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 14:37:44 -0400 Subject: [PATCH 33/64] don't bother passing new_path->dentry to can_move_mount_beneath() Signed-off-by: Al Viro --- fs/namespace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 8c5f5b1edced..b48bfb46b351 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3445,8 +3445,8 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) /** * can_move_mount_beneath - check that we can mount beneath the top mount * @mnt_from: mount we are trying to move - * @to: mount under which to mount - * @mp: mountpoint of @to + * @mnt_to: mount under which to mount + * @mp: mountpoint of @mnt_to * * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. @@ -3462,11 +3462,10 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) * Return: On success 0, and on error a negative error code is returned. */ static int can_move_mount_beneath(struct mount *mnt_from, - const struct path *to, + struct mount *mnt_to, const struct mountpoint *mp) { - struct mount *mnt_to = real_mount(to->mnt), - *parent_mnt_to = mnt_to->mnt_parent; + struct mount *parent_mnt_to = mnt_to->mnt_parent; if (IS_MNT_LOCKED(mnt_to)) return -EINVAL; @@ -3613,7 +3612,9 @@ static int do_move_mount(struct path *old_path, } if (beneath) { - err = can_move_mount_beneath(old, new_path, mp.mp); + struct mount *over = real_mount(new_path->mnt); + + err = can_move_mount_beneath(old, over, mp.mp); if (err) return err; } From 25423edc787842d17520b3f9df4d0a58a6a663b1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 03:23:12 -0400 Subject: [PATCH 34/64] new helper: topmost_overmount() Returns the final (topmost) mount in the chain of overmounts starting at given mount. Same locking rules as for any mount tree traversal - either the spinlock side of mount_lock, or rcu + sample the seqcount side of mount_lock before the call and recheck afterwards. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 7 +++++++ fs/namespace.c | 9 +++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index ed8c83ba836a..04d0eadc4c10 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -235,4 +235,11 @@ static inline void mnt_notify_add(struct mount *m) } #endif +static inline struct mount *topmost_overmount(struct mount *m) +{ + while (m->overmount) + m = m->overmount; + return m; +} + struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index b48bfb46b351..b6983adaa73b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2696,10 +2696,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, child->mnt_mountpoint); commit_tree(child); if (q) { + struct mount *r = topmost_overmount(child); struct mountpoint *mp = root.mp; - struct mount *r = child; - while (unlikely(r->overmount)) - r = r->overmount; + if (unlikely(shorter) && child != source_mnt) mp = shorter; mnt_change_mountpoint(r, mp, q); @@ -6168,9 +6167,7 @@ bool current_chrooted(void) guard(mount_locked_reader)(); - root = current->nsproxy->mnt_ns->root; - while (unlikely(root->overmount)) - root = root->overmount; + root = topmost_overmount(current->nsproxy->mnt_ns->root); return fs_root.mnt != &root->mnt || !path_mounted(&fs_root); } From 90006f21b78ab30cdab8bc5202d293655ce4cfc4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Aug 2025 17:41:55 -0400 Subject: [PATCH 35/64] do_lock_mount(): don't modify path. Currently do_lock_mount() has the target path switched to whatever might be overmounting it. We _do_ want to have the parent mount/mountpoint chosen on top of the overmounting pile; however, the way it's done has unpleasant races - if umount propagation removes the overmount while we'd been trying to set the environment up, we might end up failing if our target path strays into that overmount just before the overmount gets kicked out. Users of do_lock_mount() do not need the target path changed - they have all information in res->{parent,mp}; only one place (in do_move_mount()) currently uses the resulting path->mnt, and that value is trivial to reconstruct by the original value of path->mnt + chosen parent mount. Let's keep the target path unchanged; it avoids a bunch of subtle races and it's not hard to do: do as mount_locked_reader find the prospective parent mount/mountpoint dentry grab references if it's not the original target lock the prospective mountpoint dentry take namespace_sem exclusive if prospective parent/mountpoint would be different now err = -EAGAIN else if location has been unmounted err = -ENOENT else if mountpoint dentry is not allowed to be mounted on err = -ENOENT else if beneath and the top of the pile was the absolute root err = -EINVAL else try to get struct mountpoint (by dentry), set err to 0 on success and -ENO{MEM,ENT} on failure if err != 0 res->parent = ERR_PTR(err) drop locks else res->parent = prospective parent drop temporary references while err == -EAGAIN A somewhat subtle part is that dropping temporary references is allowed. Neither mounts nor dentries should be evicted by a thread that holds namespace_sem. On success we are dropping those references under namespace_sem, so we need to be sure that these are not the last references remaining. However, on success we'd already verified (under namespace_sem) that original target is still mounted and that mount and dentry we are about to drop are still reachable from it via the mount tree. That guarantees that we are not about to drop the last remaining references. Signed-off-by: Al Viro --- fs/namespace.c | 119 ++++++++++++++++++++++++++----------------------- 1 file changed, 63 insertions(+), 56 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index b6983adaa73b..47c7a0517663 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2727,6 +2727,27 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } +static inline struct mount *where_to_mount(const struct path *path, + struct dentry **dentry, + bool beneath) +{ + struct mount *m; + + if (unlikely(beneath)) { + m = topmost_overmount(real_mount(path->mnt)); + *dentry = m->mnt_mountpoint; + return m->mnt_parent; + } + m = __lookup_mnt(path->mnt, path->dentry); + if (unlikely(m)) { + m = topmost_overmount(m); + *dentry = m->mnt.mnt_root; + return m; + } + *dentry = path->dentry; + return real_mount(path->mnt); +} + /** * do_lock_mount - acquire environment for mounting * @path: target path @@ -2758,81 +2779,65 @@ static int attach_recursive_mnt(struct mount *source_mnt, * case we also require the location to be at the root of a mount * that has a parent (i.e. is not a root of some namespace). */ -static void do_lock_mount(struct path *path, struct pinned_mountpoint *res, bool beneath) +static void do_lock_mount(const struct path *path, + struct pinned_mountpoint *res, + bool beneath) { - struct vfsmount *mnt = path->mnt; - struct dentry *dentry; - struct path under = {}; - int err = -ENOENT; + int err; if (unlikely(beneath) && !path_mounted(path)) { res->parent = ERR_PTR(-EINVAL); return; } - for (;;) { - struct mount *m = real_mount(mnt); + do { + struct dentry *dentry, *d; + struct mount *m, *n; - if (beneath) { - path_put(&under); - read_seqlock_excl(&mount_lock); - if (unlikely(!mnt_has_parent(m))) { - read_sequnlock_excl(&mount_lock); - res->parent = ERR_PTR(-EINVAL); - return; + scoped_guard(mount_locked_reader) { + m = where_to_mount(path, &dentry, beneath); + if (&m->mnt != path->mnt) { + mntget(&m->mnt); + dget(dentry); } - under.mnt = mntget(&m->mnt_parent->mnt); - under.dentry = dget(m->mnt_mountpoint); - read_sequnlock_excl(&mount_lock); - dentry = under.dentry; - } else { - dentry = path->dentry; } inode_lock(dentry->d_inode); namespace_lock(); - if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) - break; // not to be mounted on + // check if the chain of mounts (if any) has changed. + scoped_guard(mount_locked_reader) + n = where_to_mount(path, &d, beneath); - if (beneath && unlikely(m->mnt_mountpoint != dentry || - &m->mnt_parent->mnt != under.mnt)) { + if (unlikely(n != m || dentry != d)) + err = -EAGAIN; // something moved, retry + else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt))) + err = -ENOENT; // not to be mounted on + else if (beneath && &m->mnt == path->mnt && !m->overmount) + err = -EINVAL; + else + err = get_mountpoint(dentry, res); + + if (unlikely(err)) { + res->parent = ERR_PTR(err); namespace_unlock(); inode_unlock(dentry->d_inode); - continue; // got moved + } else { + res->parent = m; } - - mnt = lookup_mnt(path); - if (unlikely(mnt)) { - namespace_unlock(); - inode_unlock(dentry->d_inode); - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - continue; // got overmounted + /* + * Drop the temporary references. This is subtle - on success + * we are doing that under namespace_sem, which would normally + * be forbidden. However, in that case we are guaranteed that + * refcounts won't reach zero, since we know that path->mnt + * is mounted and thus all mounts reachable from it are pinned + * and stable, along with their mountpoints and roots. + */ + if (&m->mnt != path->mnt) { + dput(dentry); + mntput(&m->mnt); } - err = get_mountpoint(dentry, res); - if (err) - break; - if (beneath) { - /* - * @under duplicates the references that will stay - * at least until namespace_unlock(), so the path_put() - * below is safe (and OK to do under namespace_lock - - * we are not dropping the final references here). - */ - path_put(&under); - res->parent = real_mount(path->mnt)->mnt_parent; - return; - } - res->parent = real_mount(path->mnt); - return; - } - namespace_unlock(); - inode_unlock(dentry->d_inode); - if (beneath) - path_put(&under); - res->parent = ERR_PTR(err); + } while (err == -EAGAIN); } static void __unlock_mount(struct pinned_mountpoint *m) @@ -3613,6 +3618,8 @@ static int do_move_mount(struct path *old_path, if (beneath) { struct mount *over = real_mount(new_path->mnt); + if (mp.parent != over->mnt_parent) + over = mp.parent->overmount; err = can_move_mount_beneath(old, over, mp.mp); if (err) return err; From 8be87700c9801559893d75ed6c8ae6685db60f1d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 14:48:46 -0400 Subject: [PATCH 36/64] constify check_mnt() Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 47c7a0517663..10949806ad7a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1010,7 +1010,7 @@ static void unpin_mountpoint(struct pinned_mountpoint *m) } } -static inline int check_mnt(struct mount *mnt) +static inline int check_mnt(const struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } From 08404199f3f232828dcd6614e34a0c154cddb13d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 17:04:02 -0400 Subject: [PATCH 37/64] do_mount_setattr(): constify path argument Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 10949806ad7a..5d5482a82b4d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4859,7 +4859,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) touch_mnt_namespace(mnt->mnt_ns); } -static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) +static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr) { struct mount *mnt = real_mount(path->mnt); int err = 0; From 6e024a0e280e1ce3c18579b73735cf78bb7f1ead Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 17:09:39 -0400 Subject: [PATCH 38/64] do_set_group(): constify path arguments Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 5d5482a82b4d..62cffd3b6de2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3353,7 +3353,7 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -static int do_set_group(struct path *from_path, struct path *to_path) +static int do_set_group(const struct path *from_path, const struct path *to_path) { struct mount *from = real_mount(from_path->mnt); struct mount *to = real_mount(to_path->mnt); From 1f6df5847454dee8608f78ee0df7352472cb2447 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 18:45:02 -0400 Subject: [PATCH 39/64] drop_collected_paths(): constify arguments ... and use that to constify the pointers in callers Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 2 +- kernel/audit_tree.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 62cffd3b6de2..346e577073bb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2334,9 +2334,9 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, struct path *prealloc) { - for (struct path *p = paths; p->mnt; p++) + for (const struct path *p = paths; p->mnt; p++) path_put(p); if (paths != prealloc) kfree(paths); diff --git a/include/linux/mount.h b/include/linux/mount.h index 5f9c053b0897..c09032463b36 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -105,7 +105,7 @@ extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct path *collect_paths(const struct path *, struct path *, unsigned); -extern void drop_collected_paths(struct path *, struct path *); +extern void drop_collected_paths(const struct path *, struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0eae2a3c895..32007edf0e55 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -678,7 +678,7 @@ void audit_trim_trees(void) struct audit_tree *tree; struct path path; struct audit_node *node; - struct path *paths; + const struct path *paths; struct path array[16]; int err; @@ -701,7 +701,7 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { struct inode *inode = p->dentry->d_inode; if (inode_to_key(inode) == chunk->key) { node->index &= ~(1U<<31); @@ -740,9 +740,9 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mounts(struct path *paths, struct audit_tree *tree) +static int tag_mounts(const struct path *paths, struct audit_tree *tree) { - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { int err = tag_chunk(p->dentry->d_inode, tree); if (err) return err; @@ -805,7 +805,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct audit_tree *seed = rule->tree, *tree; struct path path; struct path array[16]; - struct path *paths; + const struct path *paths; int err; rule->tree = NULL; @@ -877,7 +877,7 @@ int audit_tag_tree(char *old, char *new) int failed = 0; struct path path1, path2; struct path array[16]; - struct path *paths; + const struct path *paths; int err; err = kern_path(new, 0, &path2); From b42ffcd5069d5cfb777b8982a1c55c7e2f1d3998 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 19:34:37 -0400 Subject: [PATCH 40/64] collect_paths(): constify the return value callers have no business modifying the paths they get Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 346e577073bb..cf8dbf0741f0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2300,7 +2300,7 @@ static inline bool extend_array(struct path **res, struct path **to_free, return p; } -struct path *collect_paths(const struct path *path, +const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); @@ -2334,7 +2334,7 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(const struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, const struct path *prealloc) { for (const struct path *p = paths; p->mnt; p++) path_put(p); diff --git a/include/linux/mount.h b/include/linux/mount.h index c09032463b36..18e4b97f8a98 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -104,8 +104,8 @@ extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); -extern struct path *collect_paths(const struct path *, struct path *, unsigned); -extern void drop_collected_paths(const struct path *, struct path *); +extern const struct path *collect_paths(const struct path *, struct path *, unsigned); +extern void drop_collected_paths(const struct path *, const struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); From 44b58cdaf99264a99e9f07e90b944ecdd8081c2a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 17:13:15 -0400 Subject: [PATCH 41/64] do_move_mount(), vfs_move_mount(), do_move_mount_old(): constify struct path argument(s) Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index cf8dbf0741f0..c1c906cfcb94 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3566,8 +3566,9 @@ static inline bool may_use_mount(struct mount *mnt) return check_anonymous_mnt(mnt); } -static int do_move_mount(struct path *old_path, - struct path *new_path, enum mnt_tree_flags_t flags) +static int do_move_mount(const struct path *old_path, + const struct path *new_path, + enum mnt_tree_flags_t flags) { struct mount *old = real_mount(old_path->mnt); int err; @@ -3639,7 +3640,7 @@ static int do_move_mount(struct path *old_path, return attach_recursive_mnt(old, &mp); } -static int do_move_mount_old(struct path *path, const char *old_name) +static int do_move_mount_old(const struct path *path, const char *old_name) { struct path old_path; int err; @@ -4469,7 +4470,8 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, return ret; } -static inline int vfs_move_mount(struct path *from_path, struct path *to_path, +static inline int vfs_move_mount(const struct path *from_path, + const struct path *to_path, enum mnt_tree_flags_t mflags) { int ret; From 27e4b785596602cebe9acd0fd2d4fb4f5f9a3e4d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 14:06:07 -0400 Subject: [PATCH 42/64] mnt_warn_timestamp_expiry(): constify struct path argument Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index c1c906cfcb94..ebd1e6c23695 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3224,7 +3224,8 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) touch_mnt_namespace(mnt->mnt_ns); } -static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) +static void mnt_warn_timestamp_expiry(const struct path *mountpoint, + struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; From 17d44b452c4f0135b1b18f99e2d4fc607de59af4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 14:08:19 -0400 Subject: [PATCH 43/64] do_new_mount{,_fc}(): constify struct path argument Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ebd1e6c23695..4fafc6f8f942 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3698,7 +3698,7 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags * Create a new mount using a superblock configuration and request it * be added to the namespace tree. */ -static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, +static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint, unsigned int mnt_flags) { struct super_block *sb; @@ -3729,8 +3729,9 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *fstype, int sb_flags, - int mnt_flags, const char *name, void *data) +static int do_new_mount(const struct path *path, const char *fstype, + int sb_flags, int mnt_flags, + const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; From a8be822f61934b493bc1c8ed98aa3326b7d176df Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 14:13:52 -0400 Subject: [PATCH 44/64] do_{loopback,change_type,remount,reconfigure_mnt}(): constify struct path argument Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 4fafc6f8f942..dbaa80bfc89a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2908,7 +2908,7 @@ static int flags_to_propagation_type(int ms_flags) /* * recursively change the type of the mountpoint. */ -static int do_change_type(struct path *path, int ms_flags) +static int do_change_type(const struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); @@ -3028,8 +3028,8 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) /* * do loopback mount. */ -static int do_loopback(struct path *path, const char *old_name, - int recurse) +static int do_loopback(const struct path *path, const char *old_name, + int recurse) { struct path old_path __free(path_put) = {}; struct mount *mnt = NULL; @@ -3259,7 +3259,7 @@ static void mnt_warn_timestamp_expiry(const struct path *mountpoint, * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ -static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) +static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); @@ -3296,7 +3296,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ -static int do_remount(struct path *path, int ms_flags, int sb_flags, +static int do_remount(const struct path *path, int ms_flags, int sb_flags, int mnt_flags, void *data) { int err; From 8ec7ee2e0be710317bbfeeae8c16a3834e78ab9c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 14:18:09 -0400 Subject: [PATCH 45/64] path_mount(): constify struct path argument now it finally can be done. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/internal.h | 2 +- fs/namespace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 38e8aab27bbd..fe88563b4822 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -84,7 +84,7 @@ void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); -int path_mount(const char *dev_name, struct path *path, +int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(struct path *path, int flags); diff --git a/fs/namespace.c b/fs/namespace.c index dbaa80bfc89a..37afd82af628 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4012,7 +4012,7 @@ static char *copy_mount_string(const void __user *data) * Therefore, if this magic number is present, it carries no information * and must be discarded. */ -int path_mount(const char *dev_name, struct path *path, +int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; From 4f4b18af4c2e4abb7cb7047de5c6d7f5e4b48b05 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 14:23:24 -0400 Subject: [PATCH 46/64] may_copy_tree(), __do_loopback(): constify struct path argument Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 37afd82af628..e09688d63dac 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2984,7 +2984,7 @@ static int do_change_type(const struct path *path, int ms_flags) * * Returns true if the mount tree can be copied, false otherwise. */ -static inline bool may_copy_tree(struct path *path) +static inline bool may_copy_tree(const struct path *path) { struct mount *mnt = real_mount(path->mnt); const struct dentry_operations *d_op; @@ -3006,7 +3006,7 @@ static inline bool may_copy_tree(struct path *path) } -static struct mount *__do_loopback(struct path *old_path, int recurse) +static struct mount *__do_loopback(const struct path *old_path, int recurse) { struct mount *old = real_mount(old_path->mnt); From f91c433a5c1240b5ad48d2cb8b69d38453b63e00 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 14:24:59 -0400 Subject: [PATCH 47/64] path_umount(): constify struct path argument Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/internal.h | 2 +- fs/namespace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index fe88563b4822..549e6bd453b0 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -86,7 +86,7 @@ extern bool may_mount(void); int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page); -int path_umount(struct path *path, int flags); +int path_umount(const struct path *path, int flags); int show_path(struct seq_file *m, struct dentry *root); diff --git a/fs/namespace.c b/fs/namespace.c index e09688d63dac..0b351af53300 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2084,7 +2084,7 @@ static int can_umount(const struct path *path, int flags) } // caller is responsible for flags being sane -int path_umount(struct path *path, int flags) +int path_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; From 86af25b01df1181ac4a81d6cbbc6d48a9cc7106f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 16:55:27 -0400 Subject: [PATCH 48/64] constify can_move_mount_beneath() arguments Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 0b351af53300..a97f8dc05864 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3466,8 +3466,8 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. */ -static int can_move_mount_beneath(struct mount *mnt_from, - struct mount *mnt_to, +static int can_move_mount_beneath(const struct mount *mnt_from, + const struct mount *mnt_to, const struct mountpoint *mp) { struct mount *parent_mnt_to = mnt_to->mnt_parent; From 43d672dbf1f20c1d09b4ee73498bb39442e70f18 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 14:00:12 -0400 Subject: [PATCH 49/64] do_move_mount_old(): use __free(path_put) Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a97f8dc05864..ec4e95bab73d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3643,7 +3643,7 @@ static int do_move_mount(const struct path *old_path, static int do_move_mount_old(const struct path *path, const char *old_name) { - struct path old_path; + struct path old_path __free(path_put) = {}; int err; if (!old_name || !*old_name) @@ -3653,9 +3653,7 @@ static int do_move_mount_old(const struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path, 0); - path_put(&old_path); - return err; + return do_move_mount(&old_path, path, 0); } /* From fc9d5efc4c620807af53d62285adc7d4bf9b9978 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 14:19:58 -0400 Subject: [PATCH 50/64] do_mount(): use __free(path_put) Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ec4e95bab73d..75856c7ce746 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4092,15 +4092,13 @@ int path_mount(const char *dev_name, const struct path *path, int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { - struct path path; + struct path path __free(path_put) = {}; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; - ret = path_mount(dev_name, &path, type_page, flags, data_page); - path_put(&path); - return ret; + return path_mount(dev_name, &path, type_page, flags, data_page); } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) From 75db7fd99075bf46f3bfeaa8f1aaa8b13b2590cf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Aug 2025 12:22:03 -0400 Subject: [PATCH 51/64] umount_tree(): take all victims out of propagation graph at once For each removed mount we need to calculate where the slaves will end up. To avoid duplicating that work, do it for all mounts to be removed at once, taking the mounts themselves out of propagation graph as we go, then do all transfers; the duplicate work on finding destinations is avoided since if we run into a mount that already had destination found, we don't need to trace the rest of the way. That's guaranteed O(removed mounts) for finding destinations and removing from propagation graph and O(surviving mounts that have master removed) for transfers. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 3 ++- fs/pnode.c | 67 +++++++++++++++++++++++++++++++++++++++----------- fs/pnode.h | 1 + 3 files changed, 55 insertions(+), 16 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 75856c7ce746..ae29cbf13e39 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1846,6 +1846,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); + bulk_make_private(&tmp_list); + while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; @@ -1870,7 +1872,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) umount_mnt(p); } } - change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); diff --git a/fs/pnode.c b/fs/pnode.c index edaf9d9d0eaf..5d91c3e58d2a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -71,19 +71,6 @@ static inline bool will_be_unmounted(struct mount *m) return m->mnt.mnt_flags & MNT_UMOUNT; } -static struct mount *propagation_source(struct mount *mnt) -{ - do { - struct mount *m; - for (m = next_peer(mnt); m != mnt; m = next_peer(m)) { - if (!will_be_unmounted(m)) - return m; - } - mnt = mnt->mnt_master; - } while (mnt && will_be_unmounted(mnt)); - return mnt; -} - static void transfer_propagation(struct mount *mnt, struct mount *to) { struct hlist_node *p = NULL, *n; @@ -112,11 +99,10 @@ void change_mnt_propagation(struct mount *mnt, int type) return; } if (IS_MNT_SHARED(mnt)) { - if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list)) - m = propagation_source(mnt); if (list_empty(&mnt->mnt_share)) { mnt_release_group_id(mnt); } else { + m = next_peer(mnt); list_del_init(&mnt->mnt_share); mnt->mnt_group_id = 0; } @@ -137,6 +123,57 @@ void change_mnt_propagation(struct mount *mnt, int type) } } +static struct mount *trace_transfers(struct mount *m) +{ + while (1) { + struct mount *next = next_peer(m); + + if (next != m) { + list_del_init(&m->mnt_share); + m->mnt_group_id = 0; + m->mnt_master = next; + } else { + if (IS_MNT_SHARED(m)) + mnt_release_group_id(m); + next = m->mnt_master; + } + hlist_del_init(&m->mnt_slave); + CLEAR_MNT_SHARED(m); + SET_MNT_MARK(m); + + if (!next || !will_be_unmounted(next)) + return next; + if (IS_MNT_MARKED(next)) + return next->mnt_master; + m = next; + } +} + +static void set_destinations(struct mount *m, struct mount *master) +{ + struct mount *next; + + while ((next = m->mnt_master) != master) { + m->mnt_master = master; + m = next; + } +} + +void bulk_make_private(struct list_head *set) +{ + struct mount *m; + + list_for_each_entry(m, set, mnt_list) + if (!IS_MNT_MARKED(m)) + set_destinations(m, trace_transfers(m)); + + list_for_each_entry(m, set, mnt_list) { + transfer_propagation(m, m->mnt_master); + m->mnt_master = NULL; + CLEAR_MNT_MARK(m); + } +} + static struct mount *__propagation_next(struct mount *m, struct mount *origin) { diff --git a/fs/pnode.h b/fs/pnode.h index 00ab153e3e9d..b029db225f33 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -42,6 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2) } void change_mnt_propagation(struct mount *, int); +void bulk_make_private(struct list_head *); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); void propagate_umount(struct list_head *); From fc812c40f5eeee81836eabc3cdd017a46fe39d4c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Jul 2025 22:45:09 -0400 Subject: [PATCH 52/64] ecryptfs: get rid of pointless mount references in ecryptfs dentries ->lower_path.mnt has the same value for all dentries on given ecryptfs instance and if somebody goes for mountpoint-crossing variant where that would not be true, we can deal with that when it happens (and _not_ with duplicating these reference into each dentry). As it is, we are better off just sticking a reference into ecryptfs-private part of superblock and keeping it pinned until ->kill_sb(). That way we can stick a reference to underlying dentry right into ->d_fsdata of ecryptfs one, getting rid of indirection through struct ecryptfs_dentry_info, along with the entire struct ecryptfs_dentry_info machinery. [kudos to Dan Carpenter for spotting a bug in ecryptfs_get_tree() part] Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/ecryptfs/dentry.c | 14 +------------- fs/ecryptfs/ecryptfs_kernel.h | 27 +++++++++++---------------- fs/ecryptfs/file.c | 15 +++++++-------- fs/ecryptfs/inode.c | 19 +++++-------------- fs/ecryptfs/main.c | 24 ++++++------------------ 5 files changed, 30 insertions(+), 69 deletions(-) diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 1dfd5b81d831..6648a924e31a 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -59,14 +59,6 @@ static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name, return rc; } -struct kmem_cache *ecryptfs_dentry_info_cache; - -static void ecryptfs_dentry_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(ecryptfs_dentry_info_cache, - container_of(head, struct ecryptfs_dentry_info, rcu)); -} - /** * ecryptfs_d_release * @dentry: The ecryptfs dentry @@ -75,11 +67,7 @@ static void ecryptfs_dentry_free_rcu(struct rcu_head *head) */ static void ecryptfs_d_release(struct dentry *dentry) { - struct ecryptfs_dentry_info *p = dentry->d_fsdata; - if (p) { - path_put(&p->lower_path); - call_rcu(&p->rcu, ecryptfs_dentry_free_rcu); - } + dput(dentry->d_fsdata); } const struct dentry_operations ecryptfs_dops = { diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 1f562e75d0e4..9e6ab0b41337 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -258,13 +258,6 @@ struct ecryptfs_inode_info { struct ecryptfs_crypt_stat crypt_stat; }; -/* dentry private data. Each dentry must keep track of a lower - * vfsmount too. */ -struct ecryptfs_dentry_info { - struct path lower_path; - struct rcu_head rcu; -}; - /** * ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint * @flags: Status flags @@ -348,6 +341,7 @@ struct ecryptfs_mount_crypt_stat { /* superblock private data. */ struct ecryptfs_sb_info { struct super_block *wsi_sb; + struct vfsmount *lower_mnt; struct ecryptfs_mount_crypt_stat mount_crypt_stat; }; @@ -494,22 +488,25 @@ ecryptfs_set_superblock_lower(struct super_block *sb, } static inline void -ecryptfs_set_dentry_private(struct dentry *dentry, - struct ecryptfs_dentry_info *dentry_info) +ecryptfs_set_dentry_lower(struct dentry *dentry, + struct dentry *lower_dentry) { - dentry->d_fsdata = dentry_info; + dentry->d_fsdata = lower_dentry; } static inline struct dentry * ecryptfs_dentry_to_lower(struct dentry *dentry) { - return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; + return dentry->d_fsdata; } -static inline const struct path * -ecryptfs_dentry_to_lower_path(struct dentry *dentry) +static inline struct path +ecryptfs_lower_path(struct dentry *dentry) { - return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path; + return (struct path){ + .mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt, + .dentry = ecryptfs_dentry_to_lower(dentry) + }; } #define ecryptfs_printk(type, fmt, arg...) \ @@ -532,7 +529,6 @@ extern unsigned int ecryptfs_number_of_users; extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; extern struct kmem_cache *ecryptfs_file_info_cache; -extern struct kmem_cache *ecryptfs_dentry_info_cache; extern struct kmem_cache *ecryptfs_inode_info_cache; extern struct kmem_cache *ecryptfs_sb_info_cache; extern struct kmem_cache *ecryptfs_header_cache; @@ -557,7 +553,6 @@ int ecryptfs_encrypt_and_encode_filename( size_t *encoded_name_size, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, const char *name, size_t name_size); -struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); void ecryptfs_dump_hex(char *data, int bytes); int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 5f8f96da09fe..7929411837cf 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -33,13 +33,12 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, struct iov_iter *to) { ssize_t rc; - const struct path *path; struct file *file = iocb->ki_filp; rc = generic_file_read_iter(iocb, to); if (rc >= 0) { - path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); - touch_atime(path); + struct path path = ecryptfs_lower_path(file->f_path.dentry); + touch_atime(&path); } return rc; } @@ -59,12 +58,11 @@ static ssize_t ecryptfs_splice_read_update_atime(struct file *in, loff_t *ppos, size_t len, unsigned int flags) { ssize_t rc; - const struct path *path; rc = filemap_splice_read(in, ppos, pipe, len, flags); if (rc >= 0) { - path = ecryptfs_dentry_to_lower_path(in->f_path.dentry); - touch_atime(path); + struct path path = ecryptfs_lower_path(in->f_path.dentry); + touch_atime(&path); } return rc; } @@ -283,6 +281,7 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file) * ecryptfs_lookup() */ struct ecryptfs_file_info *file_info; struct file *lower_file; + struct path path; /* Released in ecryptfs_release or end of function if failure */ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); @@ -292,8 +291,8 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file) "Error attempting to allocate memory\n"); return -ENOMEM; } - lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), - file->f_flags, current_cred()); + path = ecryptfs_lower_path(ecryptfs_dentry); + lower_file = dentry_open(&path, file->f_flags, current_cred()); if (IS_ERR(lower_file)) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 72fbe1316ab8..d2b262dc485d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -327,24 +327,15 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry) { - const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); + struct dentry *lower_parent = ecryptfs_dentry_to_lower(dentry->d_parent); struct inode *inode, *lower_inode; - struct ecryptfs_dentry_info *dentry_info; int rc = 0; - dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if (!dentry_info) { - dput(lower_dentry); - return ERR_PTR(-ENOMEM); - } - fsstack_copy_attr_atime(d_inode(dentry->d_parent), - d_inode(path->dentry)); + d_inode(lower_parent)); BUG_ON(!d_count(lower_dentry)); - ecryptfs_set_dentry_private(dentry, dentry_info); - dentry_info->lower_path.mnt = mntget(path->mnt); - dentry_info->lower_path.dentry = lower_dentry; + ecryptfs_set_dentry_lower(dentry, lower_dentry); /* * negative dentry can go positive under us here - its parent is not @@ -1022,10 +1013,10 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, { struct dentry *dentry = path->dentry; struct kstat lower_stat; + struct path lower_path = ecryptfs_lower_path(dentry); int rc; - rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry), - &lower_stat, request_mask, flags); + rc = vfs_getattr_nosec(&lower_path, &lower_stat, request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index eab1beb846d3..16ea14dd2c62 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -106,15 +106,14 @@ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); - const struct path *path = ecryptfs_dentry_to_lower_path(dentry); + struct path path = ecryptfs_lower_path(dentry); int rc; - rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, - cred); + rc = ecryptfs_privileged_open(lower_file, path.dentry, path.mnt, cred); if (rc) { printk(KERN_ERR "Error opening lower file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " - "rc = [%d]\n", path->dentry, path->mnt, rc); + "rc = [%d]\n", path.dentry, path.mnt, rc); (*lower_file) = NULL; } return rc; @@ -437,7 +436,6 @@ static int ecryptfs_get_tree(struct fs_context *fc) struct ecryptfs_fs_context *ctx = fc->fs_private; struct ecryptfs_sb_info *sbi = fc->s_fs_info; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; - struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; struct path path; @@ -543,14 +541,8 @@ static int ecryptfs_get_tree(struct fs_context *fc) goto out_free; } - rc = -ENOMEM; - root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if (!root_info) - goto out_free; - - /* ->kill_sb() will take care of root_info */ - ecryptfs_set_dentry_private(s->s_root, root_info); - root_info->lower_path = path; + ecryptfs_set_dentry_lower(s->s_root, path.dentry); + ecryptfs_superblock_to_private(s)->lower_mnt = path.mnt; s->s_flags |= SB_ACTIVE; fc->root = dget(s->s_root); @@ -580,6 +572,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb) kill_anon_super(sb); if (!sb_info) return; + mntput(sb_info->lower_mnt); ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } @@ -667,11 +660,6 @@ static struct ecryptfs_cache_info { .name = "ecryptfs_file_cache", .size = sizeof(struct ecryptfs_file_info), }, - { - .cache = &ecryptfs_dentry_info_cache, - .name = "ecryptfs_dentry_info_cache", - .size = sizeof(struct ecryptfs_dentry_info), - }, { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", From 19ac81735c9bcb0c93895e3674792b923ac16956 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2025 03:11:30 -0400 Subject: [PATCH 53/64] fs/namespace.c: sanitize descriptions for {__,}lookup_mnt() Comments regarding "shadow mounts" were stale - no such thing anymore. Document the locking requirements for __lookup_mnt(). Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ae29cbf13e39..7489c9c716c6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -825,24 +825,16 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) } /** - * __lookup_mnt - find first child mount + * __lookup_mnt - mount hash lookup * @mnt: parent mount - * @dentry: mountpoint + * @dentry: dentry of mountpoint * - * If @mnt has a child mount @c mounted @dentry find and return it. + * If @mnt has a child mount @c mounted on @dentry find and return it. + * Caller must either hold the spinlock component of @mount_lock or + * hold rcu_read_lock(), sample the seqcount component before the call + * and recheck it afterwards. * - * Note that the child mount @c need not be unique. There are cases - * where shadow mounts are created. For example, during mount - * propagation when a source mount @mnt whose root got overmounted by a - * mount @o after path lookup but before @namespace_sem could be - * acquired gets copied and propagated. So @mnt gets copied including - * @o. When @mnt is propagated to a destination mount @d that already - * has another mount @n mounted at the same mountpoint then the source - * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on - * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt - * on @dentry. - * - * Return: The first child of @mnt mounted @dentry or NULL. + * Return: The child of @mnt mounted on @dentry or %NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { @@ -855,21 +847,12 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) return NULL; } -/* - * lookup_mnt - Return the first child mount mounted at path +/** + * lookup_mnt - Return the child mount mounted at given location + * @path: location in the namespace * - * "First" means first mounted chronologically. If you create the - * following mounts: - * - * mount /dev/sda1 /mnt - * mount /dev/sda2 /mnt - * mount /dev/sda3 /mnt - * - * Then lookup_mnt() on the base /mnt dentry in the root mount will - * return successively the root dentry and vfsmount of /dev/sda1, then - * /dev/sda2, then /dev/sda3, then NULL. - * - * lookup_mnt takes a reference to the found vfsmount. + * Acquires and returns a new reference to mount at given location + * or %NULL if nothing is mounted there. */ struct vfsmount *lookup_mnt(const struct path *path) { From 1a22542b5ffe2f91859ed7c7e610dc7b588a6713 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 2 Sep 2025 19:55:41 -0400 Subject: [PATCH 54/64] path_has_submounts(): use guard(mount_locked_reader) Needed there since the callback passed to d_walk() (path_check_mount()) is using __path_is_mountpoint(), which uses __lookup_mnt(). Has to be taken in the caller - d_walk() might take rename_lock spinlock component and that nests inside mount_lock. Signed-off-by: Al Viro --- fs/dcache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d51..ab21a8402db0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1390,6 +1390,7 @@ struct check_mount { unsigned int mounted; }; +/* locks: mount_locked_reader && dentry->d_lock */ static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; @@ -1416,9 +1417,8 @@ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; - read_seqlock_excl(&mount_lock); + guard(mount_locked_reader)(); d_walk(parent->dentry, &data, path_check_mount); - read_sequnlock_excl(&mount_lock); return data.mounted; } From 71cf10ce4562ed7b18f3bd44a4e2dfafd0e84c50 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Aug 2025 16:53:42 -0400 Subject: [PATCH 55/64] open_detached_copy(): don't bother with mount_lock_hash() we are holding namespace_sem and a reference to root of tree; iterating through that tree does not need mount_lock. Neither does the insertion into the rbtree of new namespace or incrementing the mount count of that namespace. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 7489c9c716c6..468ae0cb475e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3080,14 +3080,12 @@ static struct file *open_detached_copy(struct path *path, bool recursive) return ERR_CAST(mnt); } - lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { mnt_add_to_ns(ns, p); ns->nr_mounts++; } ns->root = mnt; mntget(&mnt->mnt); - unlock_mount_hash(); namespace_unlock(); mntput(path->mnt); From 57a7b5b0b6d9b92871bffcc21865ec07d5c8b297 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Aug 2025 16:59:16 -0400 Subject: [PATCH 56/64] open_detached_copy(): separate creation of namespace into helper ... and convert the helper to use of a guard(namespace_excl) Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 468ae0cb475e..4ebd9cc6f6c6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3047,18 +3047,17 @@ static int do_loopback(const struct path *path, const char *old_name, return err; } -static struct file *open_detached_copy(struct path *path, bool recursive) +static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; - struct file *file; ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) - return ERR_CAST(ns); + return ns; - namespace_lock(); + guard(namespace_excl)(); /* * Record the sequence number of the source mount namespace. @@ -3075,8 +3074,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive) mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { - namespace_unlock(); - free_mnt_ns(ns); + emptied_ns = ns; return ERR_CAST(mnt); } @@ -3085,11 +3083,19 @@ static struct file *open_detached_copy(struct path *path, bool recursive) ns->nr_mounts++; } ns->root = mnt; - mntget(&mnt->mnt); - namespace_unlock(); + return ns; +} + +static struct file *open_detached_copy(struct path *path, bool recursive) +{ + struct mnt_namespace *ns = get_detached_copy(path, recursive); + struct file *file; + + if (IS_ERR(ns)) + return ERR_CAST(ns); mntput(path->mnt); - path->mnt = &mnt->mnt; + path->mnt = mntget(&ns->root->mnt); file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); From 7bb4c851dcb7a4ec0b4ba7fcf3f451da0894969d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Aug 2025 17:04:44 -0400 Subject: [PATCH 57/64] copy_mnt_ns(): use the regular mechanism for freeing empty mnt_ns on failure Now that free_mnt_ns() works prior to mnt_ns_tree_add(), there's no need for an open-coded analogue free_mnt_ns() there - yes, we do avoid one call_rcu() use per failing call of clone() or unshare(), if they fail due to OOM in that particular spot, but it's not really worth bothering. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 5968c44cc38a..b4374d6d4bae 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4184,10 +4184,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { + emptied_ns = new_ns; namespace_unlock(); - ns_free_inum(&new_ns->ns); - dec_mnt_namespaces(new_ns->ucounts); - mnt_ns_release(new_ns); return ERR_CAST(new); } if (user_ns != ns->user_ns) { From d7b7253a0adc6e24869ef74a2085767cb11eb6fc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Aug 2025 17:12:15 -0400 Subject: [PATCH 58/64] copy_mnt_ns(): use guards * mntput() of rootmnt and pwdmnt done via __free(mntput) * mnt_ns_tree_add() can be done within namespace_excl scope. Signed-off-by: Al Viro --- fs/namespace.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index b4374d6d4bae..b5a082fae006 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4158,7 +4158,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct vfsmount *rootmnt __free(mntput) = NULL; + struct vfsmount *pwdmnt __free(mntput) = NULL; struct mount *p, *q; struct mount *old; struct mount *new; @@ -4177,7 +4178,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (IS_ERR(new_ns)) return new_ns; - namespace_lock(); + guard(namespace_excl)(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) @@ -4185,13 +4186,11 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { emptied_ns = new_ns; - namespace_unlock(); return ERR_CAST(new); } if (user_ns != ns->user_ns) { - lock_mount_hash(); + guard(mount_writer)(); lock_mnt_tree(new); - unlock_mount_hash(); } new_ns->root = new; @@ -4223,13 +4222,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } - namespace_unlock(); - - if (rootmnt) - mntput(rootmnt); - if (pwdmnt) - mntput(pwdmnt); - mnt_ns_tree_add(new_ns); return new_ns; } From 7f954a6f491088f18a6c7c975da8ddc8c003e518 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 13:46:33 -0400 Subject: [PATCH 59/64] simplify the callers of mnt_unhold_writers() The logics in cleanup on failure in mount_setattr_prepare() is simplified by having the mnt_hold_writers() failure followed by advancing m to the next node in the tree before leaving the loop. And since all calls are preceded by the same check that flag has been set and the function is inlined, let's just shift the check into it. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index b5a082fae006..c628a323281b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -714,13 +714,14 @@ static inline int mnt_hold_writers(struct mount *mnt) * Stop preventing write access to @mnt allowing callers to gain write access * to @mnt again. * - * This function can only be called after a successful call to - * mnt_hold_writers(). + * This function can only be called after a call to mnt_hold_writers(). * * Context: This function expects lock_mount_hash() to be held. */ static inline void mnt_unhold_writers(struct mount *mnt) { + if (!(mnt->mnt_flags & MNT_WRITE_HOLD)) + return; /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. @@ -4768,8 +4769,10 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) if (!mnt_allow_writers(kattr, m)) { err = mnt_hold_writers(m); - if (err) + if (err) { + m = next_mnt(m, mnt); break; + } } if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) @@ -4777,25 +4780,9 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) } if (err) { - struct mount *p; - - /* - * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will - * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all - * mounts and needs to take care to include the first mount. - */ - for (p = mnt; p; p = next_mnt(p, mnt)) { - /* If we had to hold writers unblock them. */ - if (p->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt_unhold_writers(p); - - /* - * We're done once the first mount we changed got - * MNT_WRITE_HOLD unset. - */ - if (p == m) - break; - } + /* undo all mnt_hold_writers() we'd done */ + for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt)) + mnt_unhold_writers(p); } return err; } @@ -4826,8 +4813,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) WRITE_ONCE(m->mnt.mnt_flags, flags); /* If we had to hold writers unblock them. */ - if (m->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt_unhold_writers(m); + mnt_unhold_writers(m); if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); From 5d132cfafb6a86740e65177c79fd5d7b02613383 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 5 Jul 2025 00:38:09 -0400 Subject: [PATCH 60/64] setup_mnt(): primitive for connecting a mount to filesystem Take the identical logics in vfs_create_mount() and clone_mnt() into a new helper that takes an empty struct mount and attaches it to given dentry (sub)tree. Should be called once in the lifetime of every mount, prior to making it visible in any data structures. After that point ->mnt_root and ->mnt_sb never change; ->mnt_root is a counting reference to dentry and ->mnt_sb - an active reference to superblock. Mount remains associated with that dentry tree all the way until the call of cleanup_mnt(), when the refcount eventually drops to zero. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index c628a323281b..4e9ed4edd854 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1196,6 +1196,21 @@ static void commit_tree(struct mount *mnt) touch_mnt_namespace(n); } +static void setup_mnt(struct mount *m, struct dentry *root) +{ + struct super_block *s = root->d_sb; + + atomic_inc(&s->s_active); + m->mnt.mnt_sb = s; + m->mnt.mnt_root = dget(root); + m->mnt_mountpoint = m->mnt.mnt_root; + m->mnt_parent = m; + + lock_mount_hash(); + list_add_tail(&m->mnt_instance, &s->s_mounts); + unlock_mount_hash(); +} + /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached @@ -1219,15 +1234,8 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - atomic_inc(&fc->root->d_sb->s_active); - mnt->mnt.mnt_sb = fc->root->d_sb; - mnt->mnt.mnt_root = dget(fc->root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; + setup_mnt(mnt, fc->root); - lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); - unlock_mount_hash(); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); @@ -1285,7 +1293,6 @@ EXPORT_SYMBOL_GPL(vfs_kern_mount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { - struct super_block *sb = old->mnt.mnt_sb; struct mount *mnt; int err; @@ -1310,16 +1317,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (mnt->mnt_group_id) set_mnt_shared(mnt); - atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); - mnt->mnt.mnt_sb = sb; - mnt->mnt.mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; - lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &sb->s_mounts); - unlock_mount_hash(); + setup_mnt(mnt, root); if (flag & CL_PRIVATE) // we are done with it return mnt; From 09a1b33c080f6ac700fadc67c8471e67bf75fda4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 12:33:11 -0400 Subject: [PATCH 61/64] preparations to taking MNT_WRITE_HOLD out of ->mnt_flags We have an unpleasant wart in accessibility rules for struct mount. There are per-superblock lists of mounts, used by sb_prepare_remount_readonly() to check if any of those is currently claimed for write access and to block further attempts to get write access on those until we are done. As soon as it is attached to a filesystem, mount becomes reachable via that list. Only sb_prepare_remount_readonly() traverses it and it only accesses a few members of struct mount. Unfortunately, ->mnt_flags is one of those and it is modified - MNT_WRITE_HOLD set and then cleared. It is done under mount_lock, so from the locking rules POV everything's fine. However, it has easily overlooked implications - once mount has been attached to a filesystem, it has to be treated as globally visible. In particular, initializing ->mnt_flags *must* be done either prior to that point or under mount_lock. All other members are still private at that point. Life gets simpler if we move that bit (and that's *all* that can get touched by access via this list) out of ->mnt_flags. It's not even hard to do - currently the list is implemented as list_head one, anchored in super_block->s_mounts and linked via mount->mnt_instance. As the first step, switch it to hlist-like open-coded structure - address of the first mount in the set is stored in ->s_mounts and ->mnt_instance replaced with ->mnt_next_for_sb and ->mnt_pprev_for_sb - the former either NULL or pointing to the next mount in set, the latter - address of either ->s_mounts or ->mnt_next_for_sb in the previous element of the set. In the next commit we'll steal the LSB of ->mnt_pprev_for_sb as replacement for MNT_WRITE_HOLD. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 4 +++- fs/namespace.c | 38 +++++++++++++++++++++++++++++--------- fs/super.c | 3 +-- include/linux/fs.h | 4 +++- 4 files changed, 36 insertions(+), 13 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 04d0eadc4c10..b208f69f69d7 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -64,7 +64,9 @@ struct mount { #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ - struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ + struct mount * __aligned(1) *mnt_pprev_for_sb; + /* except that LSB of pprev will be stolen */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ diff --git a/fs/namespace.c b/fs/namespace.c index 4e9ed4edd854..342dfd882b13 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -730,6 +730,27 @@ static inline void mnt_unhold_writers(struct mount *mnt) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } +static inline void mnt_del_instance(struct mount *m) +{ + struct mount **p = m->mnt_pprev_for_sb; + struct mount *next = m->mnt_next_for_sb; + + if (next) + next->mnt_pprev_for_sb = p; + *p = next; +} + +static inline void mnt_add_instance(struct mount *m, struct super_block *s) +{ + struct mount *first = s->s_mounts; + + if (first) + first->mnt_pprev_for_sb = &m->mnt_next_for_sb; + m->mnt_next_for_sb = first; + m->mnt_pprev_for_sb = &s->s_mounts; + s->s_mounts = m; +} + static int mnt_make_readonly(struct mount *mnt) { int ret; @@ -743,7 +764,6 @@ static int mnt_make_readonly(struct mount *mnt) int sb_prepare_remount_readonly(struct super_block *sb) { - struct mount *mnt; int err = 0; /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ @@ -751,9 +771,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) return -EBUSY; lock_mount_hash(); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - err = mnt_hold_writers(mnt); + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (!(m->mnt.mnt_flags & MNT_READONLY)) { + err = mnt_hold_writers(m); if (err) break; } @@ -763,9 +783,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (m->mnt.mnt_flags & MNT_WRITE_HOLD) + m->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } unlock_mount_hash(); @@ -1207,7 +1227,7 @@ static void setup_mnt(struct mount *m, struct dentry *root) m->mnt_parent = m; lock_mount_hash(); - list_add_tail(&m->mnt_instance, &s->s_mounts); + mnt_add_instance(m, s); unlock_mount_hash(); } @@ -1425,7 +1445,7 @@ static void mntput_no_expire(struct mount *mnt) mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); - list_del(&mnt->mnt_instance); + mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..3b0f49e1b817 100644 --- a/fs/super.c +++ b/fs/super.c @@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, if (!s) return NULL; - INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -408,7 +407,7 @@ static void __put_super(struct super_block *s) list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); - WARN_ON(!list_empty(&s->s_mounts)); + WARN_ON(s->s_mounts); call_rcu(&s->rcu, destroy_super_rcu); } } diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..0e9c7f1460dc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1324,6 +1324,8 @@ struct sb_writers { struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; +struct mount; + struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ @@ -1358,7 +1360,7 @@ struct super_block { __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ - struct list_head s_mounts; /* list of mounts; _not_ for fs use */ + struct mount *s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; struct backing_dev_info *s_bdi; From 3371fa2f27134fc4ec7d40b2ae7b9e92c3b2527e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 13:37:12 -0400 Subject: [PATCH 62/64] struct mount: relocate MNT_WRITE_HOLD bit ... from ->mnt_flags to LSB of ->mnt_pprev_for_sb. This is safe - we always set and clear it within the same mount_lock scope, so we won't interfere with list operations - traversals are always forward, so they don't even look at ->mnt_prev_for_sb and both insertions and removals are in mount_lock scopes of their own, so that bit will be clear in *all* mount instances during those. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 25 ++++++++++++++++++++++++- fs/namespace.c | 34 +++++++++++++++++----------------- include/linux/mount.h | 3 +-- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index b208f69f69d7..40cf16544317 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -66,7 +66,8 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ struct mount * __aligned(1) *mnt_pprev_for_sb; - /* except that LSB of pprev will be stolen */ + /* except that LSB of pprev is stolen */ +#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ @@ -244,4 +245,26 @@ static inline struct mount *topmost_overmount(struct mount *m) return m; } +static inline bool __test_write_hold(struct mount * __aligned(1) *val) +{ + return (unsigned long)val & WRITE_HOLD; +} + +static inline bool test_write_hold(const struct mount *m) +{ + return __test_write_hold(m->mnt_pprev_for_sb); +} + +static inline void set_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + | WRITE_HOLD); +} + +static inline void clear_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + & ~WRITE_HOLD); +} + struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index 342dfd882b13..714e159ed9cd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -509,20 +509,20 @@ int mnt_get_write_access(struct vfsmount *m) mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass - * MNT_WRITE_HOLD loop below, so that the slowpath can see our - * incremented count after it has set MNT_WRITE_HOLD. + * WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task - * setting MNT_WRITE_HOLD got preempted on a remote + * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents life lock if the task setting - * MNT_WRITE_HOLD has a lower priority and is bound to + * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); @@ -533,7 +533,7 @@ int mnt_get_write_access(struct vfsmount *m) } /* * The barrier pairs with the barrier sb_start_ro_state_change() making - * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. @@ -672,15 +672,15 @@ EXPORT_SYMBOL(mnt_drop_write_file); * @mnt. * * Context: This function expects lock_mount_hash() to be held serializing - * setting MNT_WRITE_HOLD. + * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + set_write_hold(mnt); /* - * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); @@ -696,9 +696,9 @@ static inline int mnt_hold_writers(struct mount *mnt) * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. - * MNT_WRITE_HOLD protects against this scenario, because + * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on - * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) @@ -720,14 +720,14 @@ static inline int mnt_hold_writers(struct mount *mnt) */ static inline void mnt_unhold_writers(struct mount *mnt) { - if (!(mnt->mnt_flags & MNT_WRITE_HOLD)) + if (!test_write_hold(mnt)) return; /* - * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + clear_write_hold(mnt); } static inline void mnt_del_instance(struct mount *m) @@ -766,7 +766,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) { int err = 0; - /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + /* Racy optimization. Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; @@ -784,8 +784,8 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { - if (m->mnt.mnt_flags & MNT_WRITE_HOLD) - m->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + if (test_write_hold(m)) + clear_write_hold(m); } unlock_mount_hash(); diff --git a/include/linux/mount.h b/include/linux/mount.h index 18e4b97f8a98..85e97b9340ff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -33,7 +33,6 @@ enum mount_flags { MNT_NOSYMFOLLOW = 0x80, MNT_SHRINKABLE = 0x100, - MNT_WRITE_HOLD = 0x200, MNT_INTERNAL = 0x4000, @@ -52,7 +51,7 @@ enum mount_flags { | MNT_READONLY | MNT_NOSYMFOLLOW, MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME, - MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | + MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_LOCKED }; From 1e414adf03ae5a9928aad9044a07adeb92ddcd2c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 14:04:31 -0400 Subject: [PATCH 63/64] WRITE_HOLD machinery: no need for to bump mount_lock seqcount ... neither for insertion into the list of instances, nor for mnt_{un,}hold_writers(), nor for mnt_get_write_access() deciding to be nice to RT during a busy-wait loop - all of that only needs the spinlock side of mount_lock. IOW, it's mount_locked_reader, not mount_writer. Clarify the comment re locking rules for mnt_unhold_writers() - it's not just that mount_lock needs to be held when calling that, it must have been held all along since the matching mnt_hold_writers(). Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 714e159ed9cd..54066c9b8da0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -526,8 +526,8 @@ int mnt_get_write_access(struct vfsmount *m) * the same CPU as the task that is spinning here. */ preempt_enable(); - lock_mount_hash(); - unlock_mount_hash(); + read_seqlock_excl(&mount_lock); + read_sequnlock_excl(&mount_lock); preempt_disable(); } } @@ -671,7 +671,7 @@ EXPORT_SYMBOL(mnt_drop_write_file); * a call to mnt_unhold_writers() in order to stop preventing write access to * @mnt. * - * Context: This function expects lock_mount_hash() to be held serializing + * Context: This function expects to be in mount_locked_reader scope serializing * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. @@ -716,7 +716,8 @@ static inline int mnt_hold_writers(struct mount *mnt) * * This function can only be called after a call to mnt_hold_writers(). * - * Context: This function expects lock_mount_hash() to be held. + * Context: This function expects to be in the same mount_locked_reader scope + * as the matching mnt_hold_writers(). */ static inline void mnt_unhold_writers(struct mount *mnt) { @@ -770,7 +771,8 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; - lock_mount_hash(); + guard(mount_locked_reader)(); + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { if (!(m->mnt.mnt_flags & MNT_READONLY)) { err = mnt_hold_writers(m); @@ -787,7 +789,6 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (test_write_hold(m)) clear_write_hold(m); } - unlock_mount_hash(); return err; } @@ -1226,9 +1227,8 @@ static void setup_mnt(struct mount *m, struct dentry *root) m->mnt_mountpoint = m->mnt.mnt_root; m->mnt_parent = m; - lock_mount_hash(); + guard(mount_locked_reader)(); mnt_add_instance(m, s); - unlock_mount_hash(); } /** From a79765248649de77771c24f7be08ff4c96f16f7a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Aug 2025 02:48:13 -0400 Subject: [PATCH 64/64] constify {__,}mnt_is_readonly() Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 54066c9b8da0..4b74fabced43 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -428,7 +428,7 @@ static struct mount *alloc_vfsmnt(const char *name) * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ -bool __mnt_is_readonly(struct vfsmount *mnt) +bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } @@ -468,7 +468,7 @@ static unsigned int mnt_get_writers(struct mount *mnt) #endif } -static int mnt_is_readonly(struct vfsmount *mnt) +static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; diff --git a/include/linux/mount.h b/include/linux/mount.h index 85e97b9340ff..acfe7ef86a1b 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -76,7 +76,7 @@ extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern void mnt_make_shortterm(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(const struct path *path); -extern bool __mnt_is_readonly(struct vfsmount *mnt); +extern bool __mnt_is_readonly(const struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path);