From 0b7747a5477eb22d041997bc085fa8d492fa9b96 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 23 Mar 2025 18:19:55 +0100 Subject: [PATCH 01/10] pidfs: cleanup the usage of do_notify_pidfd() If a single-threaded process exits do_notify_pidfd() will be called twice, from exit_notify() and right after that from do_notify_parent(). 1. Change exit_notify() to call do_notify_pidfd() if the exiting task is not ptraced and it is not a group leader. 2. Change do_notify_parent() to call do_notify_pidfd() unconditionally. If tsk is not ptraced, do_notify_parent() will only be called when it is a group-leader and thread_group_empty() is true. This means that if tsk is ptraced, do_notify_pidfd() will be called from do_notify_parent() even if tsk is a delay_group_leader(). But this case is less common, and apart from the unnecessary __wake_up() is harmless. Granted, this unnecessary __wake_up() can be avoided, but I don't want to do it in this patch because it's just a consequence of another historical oddity: we notify the tracer even if !thread_group_empty(), but do_wait() from debugger can't work until all other threads exit. With or without this patch we should either eliminate do_notify_parent() in this case, or change do_wait(WEXITED) to untrace the ptraced delay_group_leader() at least when ptrace_reparented(). Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250323171955.GA834@redhat.com Signed-off-by: Christian Brauner --- kernel/exit.c | 8 ++------ kernel/signal.c | 8 +++----- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index c2e6c7b7779f..5d1226fdfadc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -756,12 +756,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; - /* - * Ignore thread-group leaders that exited before all - * subthreads did. 
- */ - if (!delay_group_leader(tsk)) - do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && @@ -774,6 +768,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; + /* untraced sub-thread */ + do_notify_pidfd(tsk); } if (autoreap) { diff --git a/kernel/signal.c b/kernel/signal.c index 027ad9e97417..1d8db0dabb71 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2179,11 +2179,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); - /* - * Notify for thread-group leaders without subthreads. - */ - if (thread_group_empty(tsk)) - do_notify_pidfd(tsk); + + /* ptraced, or group-leader without sub-threads */ + do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* From 8661bb9c717a07b7636224339fe8818b65db6ddf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 23 Mar 2025 18:45:18 +0100 Subject: [PATCH 02/10] selftests/pidfd: fixes syscall number defines I had to spend some (a lot;) time to understand why pidfd_info_test (and more) fails with my patch under qemu on my machine ;) Until I applied the patch below. I think it is a bad idea to do the things like #ifndef __NR_clone3 #define __NR_clone3 -1 #endif because this can hide a problem. My working laptop runs Fedora-23 which doesn't have __NR_clone3/etc in /usr/include/. So "make" happily succeeds, but everything fails and it is not clear why. 
Link: https://lore.kernel.org/r/20250323174518.GB834@redhat.com Signed-off-by: Christian Brauner --- tools/testing/selftests/clone3/clone3_selftests.h | 2 +- tools/testing/selftests/pidfd/pidfd.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h index 3d2663fe50ba..eeca8005723f 100644 --- a/tools/testing/selftests/clone3/clone3_selftests.h +++ b/tools/testing/selftests/clone3/clone3_selftests.h @@ -16,7 +16,7 @@ #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) #ifndef __NR_clone3 -#define __NR_clone3 -1 +#define __NR_clone3 435 #endif struct __clone_args { diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index cec22aa11cdf..55bcf81a2b9a 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -32,19 +32,19 @@ #endif #ifndef __NR_pidfd_open -#define __NR_pidfd_open -1 +#define __NR_pidfd_open 434 #endif #ifndef __NR_pidfd_send_signal -#define __NR_pidfd_send_signal -1 +#define __NR_pidfd_send_signal 424 #endif #ifndef __NR_clone3 -#define __NR_clone3 -1 +#define __NR_clone3 435 #endif #ifndef __NR_pidfd_getfd -#define __NR_pidfd_getfd -1 +#define __NR_pidfd_getfd 438 #endif #ifndef PIDFD_NONBLOCK From af7bb0d2ca459f15cb5ca604dab5d9af103643f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 24 Mar 2025 17:00:03 +0100 Subject: [PATCH 03/10] exec: fix the racy usage of fs_struct->in_exec check_unsafe_exec() sets fs->in_exec under cred_guard_mutex, then execve() paths clear fs->in_exec lockless. This is fine if exec succeeds, but if it fails we have the following race: T1 sets fs->in_exec = 1, fails, drops cred_guard_mutex T2 sets fs->in_exec = 1 T1 clears fs->in_exec T2 continues with fs->in_exec == 0 Change fs/exec.c to clear fs->in_exec with cred_guard_mutex held. 
Reported-by: syzbot+1c486d0b62032c82a968@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67dc67f0.050a0220.25ae54.001f.GAE@google.com/ Cc: stable@vger.kernel.org Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250324160003.GA8878@redhat.com Signed-off-by: Christian Brauner --- fs/exec.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index f45859ad13ac..5d1c0d2dc403 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1227,13 +1227,12 @@ int begin_new_exec(struct linux_binprm * bprm) */ bprm->point_of_no_return = true; - /* - * Make this the only thread in the thread group. - */ + /* Make this the only thread in the thread group */ retval = de_thread(me); if (retval) goto out; - + /* see the comment in check_unsafe_exec() */ + current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve */ @@ -1495,6 +1494,8 @@ static void free_bprm(struct linux_binprm *bprm) } free_arg_pages(bprm); if (bprm->cred) { + /* in case exec fails before de_thread() succeeds */ + current->fs->in_exec = 0; mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1616,6 +1617,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... + * + * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) + * from another sub-thread until de_thread() succeeds, this + * state is protected by cred_guard_mutex we hold. 
*/ n_fs = 1; spin_lock(&p->fs->lock); @@ -1860,7 +1865,6 @@ static int bprm_execve(struct linux_binprm *bprm) sched_mm_cid_after_execve(current); /* execve succeeded */ - current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); user_events_execve(current); @@ -1879,7 +1883,6 @@ static int bprm_execve(struct linux_binprm *bprm) force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); - current->fs->in_exec = 0; current->in_execve = 0; return retval; From 406fad7698f5bf21ab6b5ca195bf4b9e0b3990ed Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Tue, 25 Mar 2025 09:59:05 -0300 Subject: [PATCH 04/10] cachefiles: Fix oops in vfs_mkdir from cachefiles_get_directory Commit c54b386969a5 ("VFS: Change vfs_mkdir() to return the dentry.") changed cachefiles_get_directory, replacing "subdir" with a ERR_PTR from the result of cachefiles_inject_write_error, which is either 0 or some error code. This causes an oops when the resulting pointer is passed to vfs_mkdir. Use a similar pattern to what is used earlier in the function; replace subdir with either the return value from vfs_mkdir, or the ERR_PTR of the cachefiles_inject_write_error() return value, but only if it is non zero. 
Fixes: c54b386969a5 ("VFS: Change vfs_mkdir() to return the dentry.") cc: netfs@lists.linux.dev Signed-off-by: Marc Dionne Link: https://lore.kernel.org/r/20250325125905.395372-1-marc.dionne@auristor.com Signed-off-by: Christian Brauner --- fs/cachefiles/namei.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 83a60126de0f..14d0cc894000 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -128,10 +128,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; - subdir = ERR_PTR(cachefiles_inject_write_error()); - if (!IS_ERR(subdir)) + ret = cachefiles_inject_write_error(); + if (ret == 0) subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); - ret = PTR_ERR(subdir); + else + subdir = ERR_PTR(ret); if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); From 1243045c9448cd3f29e9d075de58dc81a0c2c3d9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 24 Mar 2025 16:11:03 -0400 Subject: [PATCH 05/10] netfs: add Paulo as maintainer and remove myself as Reviewer My role has changed since I originally agreed to help with netfs, and I'm no longer providing a lot of value here. Luckily, Paulo has agreed to step in as co-maintainer. 
Acked-by: Paulo Alcantara Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20250324-master-v1-1-e2dd2fdb15b4@kernel.org Signed-off-by: Christian Brauner --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3b5fa8436987..e79c8a1c58d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8949,7 +8949,7 @@ F: include/linux/iomap.h FILESYSTEMS [NETFS LIBRARY] M: David Howells -R: Jeff Layton +M: Paulo Alcantara L: netfs@lists.linux.dev L: linux-fsdevel@vger.kernel.org S: Supported From 9133607de37a4887c6f89ed937176a0a0c1ebb17 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 24 Mar 2025 18:19:41 +0100 Subject: [PATCH 06/10] exit: fix the usage of delay_group_leader->exit_code in do_notify_parent() and pidfs_exit() Consider a process with a group leader L and a sub-thread T. L does sys_exit(1), then T does sys_exit_group(2). In this case wait_task_zombie(L) will notice SIGNAL_GROUP_EXIT and use L->signal->group_exit_code, this is correct. But, before that, do_notify_parent(L) called by release_task(T) will use L->exit_code != L->signal->group_exit_code, and this is not consistent. We don't really care, I think that nobody relies on the info which comes with SIGCHLD, if nothing else SIGCHLD < SIGRTMIN can be queued only once. But pidfs_exit() is more problematic, I think pidfs_exit_info->exit_code should report ->group_exit_code in this case, just like wait_task_zombie(). TODO: with this change we can hopefully clean up (or maybe even kill) the similar SIGNAL_GROUP_EXIT checks, at least in wait_task_zombie(). 
Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250324171941.GA13114@redhat.com Signed-off-by: Christian Brauner --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/exit.c b/kernel/exit.c index 5d1226fdfadc..1b51dc099f1e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -268,6 +268,9 @@ void release_task(struct task_struct *p) leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + /* for pidfs_exit() and do_notify_parent() */ + if (leader->signal->flags & SIGNAL_GROUP_EXIT) + leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, From e3206c4aa06fb7c7165b5a4f49cb3d5f35ccc0e9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 24 Mar 2025 18:32:26 +0100 Subject: [PATCH 07/10] exportfs: add module description Every loadable module should have a description, to avoid a warning such as: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/exportfs/exportfs.o Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250324173242.1501003-1-arnd@kernel.org Signed-off-by: Christian Brauner --- fs/exportfs/expfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index b5845c4846b8..128dd092916b 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -608,4 +608,5 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, } EXPORT_SYMBOL_GPL(exportfs_decode_fh); +MODULE_DESCRIPTION("Code mapping from inodes to file handles"); MODULE_LICENSE("GPL"); From 8de544883456d1cd86dc971e21e6e764f393c7d0 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Wed, 26 Mar 2025 17:45:30 +0100 Subject: [PATCH 08/10] MAINTAINERS: configfs: add Andreas Hindborg as maintainer Remove Joel Becker as maintainer of configfs and add Andreas Hindborg as maintainer and Breno Leitao as reviewer. 
Also update the tree URL. Add an entry for Joel Becker to CREDITS. Acked-by: Breno Leitao Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250326-configfs-maintainer-v1-1-b175189fa27b@kernel.org Signed-off-by: Christian Brauner --- CREDITS | 4 ++++ MAINTAINERS | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index 53d11a46fd69..3ec620f7260b 100644 --- a/CREDITS +++ b/CREDITS @@ -317,6 +317,10 @@ S: Code 930.5, Goddard Space Flight Center S: Greenbelt, Maryland 20771 S: USA +N: Joel Becker +E: jlbec@evilplan.org +D: configfs + N: Adam Belay E: ambx1@neo.rr.com D: Linux Plug and Play Support diff --git a/MAINTAINERS b/MAINTAINERS index e79c8a1c58d8..81215f2ee4d9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5867,9 +5867,10 @@ S: Maintained F: Documentation/security/snp-tdx-threat-model.rst CONFIGFS -M: Joel Becker +M: Andreas Hindborg +R: Breno Leitao S: Supported -T: git git://git.infradead.org/users/hch/configfs.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/a.hindborg/linux.git configfs-next F: fs/configfs/ F: include/linux/configfs.h F: samples/configfs/ From 9e6901f17a719650be376f04d742bdbe1d7094ce Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 26 Mar 2025 18:17:44 -0600 Subject: [PATCH 09/10] fs: namespace: Avoid -Wflex-array-member-not-at-end warning -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Move the conflicting declaration to the end of the structure. Notice that `struct statmount` is a flexible structure --a structure that contains a flexible-array member. Fix the following warning: fs/namespace.c:5329:26: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: "Gustavo A. R. 
Silva" Link: https://lore.kernel.org/r/Z-SZKNdCiAkVJvqm@kspp Signed-off-by: Christian Brauner --- fs/namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 6100e5b962a6..16292ff760c9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5326,8 +5326,10 @@ struct kstatmount { struct mnt_idmap *idmap; u64 mask; struct path root; - struct statmount sm; struct seq_file seq; + + /* Must be last --ends in a flexible-array member. */ + struct statmount sm; }; static u64 mnt_to_attr_flags(struct vfsmount *mnt) From 923936efeb74b3f42e5ad283a0b9110bda102601 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 28 Mar 2025 01:01:19 +0800 Subject: [PATCH 10/10] iomap: Fix conflicting values of iomap flags IOMAP_F_ATOMIC_BIO mistakenly took the same value as of IOMAP_F_SIZE_CHANGED in patch '370a6de7651b ("iomap: rework IOMAP atomic flags")'. Let's fix this and let's also create some more space for filesystem reported flags to avoid this in future. This patch makes the core iomap flags to start from bit 15, moving downwards. Note that "flags" member within struct iomap is of type u16. 
Fixes: 370a6de7651b ("iomap: rework IOMAP atomic flags") Signed-off-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20250327170119.61045-1-ritesh.list@gmail.com Reviewed-by: John Garry Signed-off-by: Christian Brauner --- include/linux/iomap.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 02fe001feebb..68416b135151 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -78,6 +78,11 @@ struct vm_fault; #define IOMAP_F_ANON_WRITE (1U << 7) #define IOMAP_F_ATOMIC_BIO (1U << 8) +/* + * Flag reserved for file system specific usage + */ +#define IOMAP_F_PRIVATE (1U << 12) + /* * Flags set by the core iomap code during operations: * @@ -88,14 +93,8 @@ struct vm_fault; * range it covers needs to be remapped by the high level before the operation * can proceed. */ -#define IOMAP_F_SIZE_CHANGED (1U << 8) -#define IOMAP_F_STALE (1U << 9) - -/* - * Flags from 0x1000 up are for file system specific usage: - */ -#define IOMAP_F_PRIVATE (1U << 12) - +#define IOMAP_F_SIZE_CHANGED (1U << 14) +#define IOMAP_F_STALE (1U << 15) /* * Magic value for addr: