From 0b7747a5477eb22d041997bc085fa8d492fa9b96 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sun, 23 Mar 2025 18:19:55 +0100
Subject: [PATCH 01/10] pidfs: cleanup the usage of do_notify_pidfd()

If a single-threaded process exits, do_notify_pidfd() will be called twice:
from exit_notify() and right after that from do_notify_parent().

1. Change exit_notify() to call do_notify_pidfd() if the exiting task is
   not ptraced and it is not a group leader.

2. Change do_notify_parent() to call do_notify_pidfd() unconditionally.

   If tsk is not ptraced, do_notify_parent() will only be called when it
   is a group-leader and thread_group_empty() is true.

This means that if tsk is ptraced, do_notify_pidfd() will be called from
do_notify_parent() even if tsk is a delay_group_leader(). But this case is
less common and, apart from the unnecessary __wake_up(), is harmless.

Granted, this unnecessary __wake_up() can be avoided, but I don't want to
do it in this patch because it's just a consequence of another historical
oddity: we notify the tracer even if !thread_group_empty(), but do_wait()
from debugger can't work until all other threads exit. With or without this
patch we should either eliminate do_notify_parent() in this case, or change
do_wait(WEXITED) to untrace the ptraced delay_group_leader() at least when
ptrace_reparented().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20250323171955.GA834@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/exit.c   | 8 ++------
 kernel/signal.c | 8 +++-----
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index c2e6c7b7779f..5d1226fdfadc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -756,12 +756,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
 
 	tsk->exit_state = EXIT_ZOMBIE;
-	/*
-	 * Ignore thread-group leaders that exited before all
-	 * subthreads did.
-	 */
-	if (!delay_group_leader(tsk))
-		do_notify_pidfd(tsk);
 
 	if (unlikely(tsk->ptrace)) {
 		int sig = thread_group_leader(tsk) &&
@@ -774,6 +768,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 			do_notify_parent(tsk, tsk->exit_signal);
 	} else {
 		autoreap = true;
+		/* untraced sub-thread */
+		do_notify_pidfd(tsk);
 	}
 
 	if (autoreap) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 027ad9e97417..1d8db0dabb71 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2179,11 +2179,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 
 	WARN_ON_ONCE(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
-	/*
-	 * Notify for thread-group leaders without subthreads.
-	 */
-	if (thread_group_empty(tsk))
-		do_notify_pidfd(tsk);
+
+	/* ptraced, or group-leader without sub-threads */
+	do_notify_pidfd(tsk);
 
 	if (sig != SIGCHLD) {
 		/*

From 8661bb9c717a07b7636224339fe8818b65db6ddf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sun, 23 Mar 2025 18:45:18 +0100
Subject: [PATCH 02/10] selftests/pidfd: fixes syscall number defines

I had to spend some (a lot;) time to understand why pidfd_info_test
(and more) fails with my patch under qemu on my machine ;) Until I
applied the patch below.

I think it is a bad idea to do things like

	#ifndef __NR_clone3
	#define __NR_clone3 -1
	#endif

because this can hide a problem. My working laptop runs Fedora-23 which
doesn't have __NR_clone3/etc in /usr/include/. So "make" happily succeeds,
but everything fails and it is not clear why.

Link: https://lore.kernel.org/r/20250323174518.GB834@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 tools/testing/selftests/clone3/clone3_selftests.h | 2 +-
 tools/testing/selftests/pidfd/pidfd.h             | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h
index 3d2663fe50ba..eeca8005723f 100644
--- a/tools/testing/selftests/clone3/clone3_selftests.h
+++ b/tools/testing/selftests/clone3/clone3_selftests.h
@@ -16,7 +16,7 @@
 #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
 
 #ifndef __NR_clone3
-#define __NR_clone3 -1
+#define __NR_clone3 435
 #endif
 
 struct __clone_args {
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index cec22aa11cdf..55bcf81a2b9a 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -32,19 +32,19 @@
 #endif
 
 #ifndef __NR_pidfd_open
-#define __NR_pidfd_open -1
+#define __NR_pidfd_open 434
 #endif
 
 #ifndef __NR_pidfd_send_signal
-#define __NR_pidfd_send_signal -1
+#define __NR_pidfd_send_signal 424
 #endif
 
 #ifndef __NR_clone3
-#define __NR_clone3 -1
+#define __NR_clone3 435
 #endif
 
 #ifndef __NR_pidfd_getfd
-#define __NR_pidfd_getfd -1
+#define __NR_pidfd_getfd 438
 #endif
 
 #ifndef PIDFD_NONBLOCK

From af7bb0d2ca459f15cb5ca604dab5d9af103643f0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 24 Mar 2025 17:00:03 +0100
Subject: [PATCH 03/10] exec: fix the racy usage of fs_struct->in_exec

check_unsafe_exec() sets fs->in_exec under cred_guard_mutex, then execve()
paths clear fs->in_exec locklessly. This is fine if exec succeeds, but if
it fails we have the following race:

	T1 sets fs->in_exec = 1, fails, drops cred_guard_mutex

	T2 sets fs->in_exec = 1

	T1 clears fs->in_exec

	T2 continues with fs->in_exec == 0

Change fs/exec.c to clear fs->in_exec with cred_guard_mutex held.

Reported-by: syzbot+1c486d0b62032c82a968@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67dc67f0.050a0220.25ae54.001f.GAE@google.com/
Cc: stable@vger.kernel.org
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20250324160003.GA8878@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exec.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index f45859ad13ac..5d1c0d2dc403 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1227,13 +1227,12 @@ int begin_new_exec(struct linux_binprm * bprm)
 	 */
 	bprm->point_of_no_return = true;
 
-	/*
-	 * Make this the only thread in the thread group.
-	 */
+	/* Make this the only thread in the thread group */
 	retval = de_thread(me);
 	if (retval)
 		goto out;
-
+	/* see the comment in check_unsafe_exec() */
+	current->fs->in_exec = 0;
 	/*
 	 * Cancel any io_uring activity across execve
 	 */
@@ -1495,6 +1494,8 @@ static void free_bprm(struct linux_binprm *bprm)
 	}
 	free_arg_pages(bprm);
 	if (bprm->cred) {
+		/* in case exec fails before de_thread() succeeds */
+		current->fs->in_exec = 0;
 		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
@@ -1616,6 +1617,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	 * suid exec because the differently privileged task
 	 * will be able to manipulate the current directory, etc.
 	 * It would be nice to force an unshare instead...
+	 *
+	 * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS)
+	 * from another sub-thread until de_thread() succeeds, this
+	 * state is protected by cred_guard_mutex we hold.
 	 */
 	n_fs = 1;
 	spin_lock(&p->fs->lock);
@@ -1860,7 +1865,6 @@ static int bprm_execve(struct linux_binprm *bprm)
 
 	sched_mm_cid_after_execve(current);
 	/* execve succeeded */
-	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	rseq_execve(current);
 	user_events_execve(current);
@@ -1879,7 +1883,6 @@ static int bprm_execve(struct linux_binprm *bprm)
 		force_fatal_sig(SIGSEGV);
 
 	sched_mm_cid_after_execve(current);
-	current->fs->in_exec = 0;
 	current->in_execve = 0;
 
 	return retval;

From 406fad7698f5bf21ab6b5ca195bf4b9e0b3990ed Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.dionne@auristor.com>
Date: Tue, 25 Mar 2025 09:59:05 -0300
Subject: [PATCH 04/10] cachefiles: Fix oops in vfs_mkdir from
 cachefiles_get_directory

Commit c54b386969a5 ("VFS: Change vfs_mkdir() to return the dentry.")
changed cachefiles_get_directory(), replacing "subdir" with an ERR_PTR
built from the result of cachefiles_inject_write_error(), which is
either 0 or some error code.  In the 0 case ERR_PTR(0) is NULL, which
IS_ERR() does not flag, so this causes an oops when the resulting
pointer is passed to vfs_mkdir().

Use a similar pattern to what is used earlier in the function: replace
subdir with either the return value from vfs_mkdir(), or the ERR_PTR
of the cachefiles_inject_write_error() return value, but only if it
is non-zero.

Fixes: c54b386969a5 ("VFS: Change vfs_mkdir() to return the dentry.")
cc: netfs@lists.linux.dev
Signed-off-by: Marc Dionne <marc.dionne@auristor.com>
Link: https://lore.kernel.org/r/20250325125905.395372-1-marc.dionne@auristor.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/cachefiles/namei.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 83a60126de0f..14d0cc894000 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -128,10 +128,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 		ret = security_path_mkdir(&path, subdir, 0700);
 		if (ret < 0)
 			goto mkdir_error;
-		subdir = ERR_PTR(cachefiles_inject_write_error());
-		if (!IS_ERR(subdir))
+		ret = cachefiles_inject_write_error();
+		if (ret == 0)
 			subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
-		ret = PTR_ERR(subdir);
+		else
+			subdir = ERR_PTR(ret);
 		if (IS_ERR(subdir)) {
 			trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
 						   cachefiles_trace_mkdir_error);

From 1243045c9448cd3f29e9d075de58dc81a0c2c3d9 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 24 Mar 2025 16:11:03 -0400
Subject: [PATCH 05/10] netfs: add Paulo as maintainer and remove myself as
 Reviewer

My role has changed since I originally agreed to help with netfs, and
I'm no longer providing a lot of value here. Luckily, Paulo has agreed
to step in as co-maintainer.

Acked-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20250324-master-v1-1-e2dd2fdb15b4@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 3b5fa8436987..e79c8a1c58d8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8949,7 +8949,7 @@ F:	include/linux/iomap.h
 
 FILESYSTEMS [NETFS LIBRARY]
 M:	David Howells <dhowells@redhat.com>
-R:	Jeff Layton <jlayton@kernel.org>
+M:	Paulo Alcantara <pc@manguebit.com>
 L:	netfs@lists.linux.dev
 L:	linux-fsdevel@vger.kernel.org
 S:	Supported

From 9133607de37a4887c6f89ed937176a0a0c1ebb17 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 24 Mar 2025 18:19:41 +0100
Subject: [PATCH 06/10] exit: fix the usage of delay_group_leader->exit_code in
 do_notify_parent() and pidfs_exit()

Consider a process with a group leader L and a sub-thread T.
L does sys_exit(1), then T does sys_exit_group(2).

In this case wait_task_zombie(L) will notice SIGNAL_GROUP_EXIT and use
L->signal->group_exit_code, this is correct.

But, before that, do_notify_parent(L) called by release_task(T) will use
L->exit_code != L->signal->group_exit_code, and this is not consistent.
We don't really care; I think that nobody relies on the info which comes
with SIGCHLD. If nothing else, SIGCHLD < SIGRTMIN can be queued only once.

But pidfs_exit() is more problematic, I think pidfs_exit_info->exit_code
should report ->group_exit_code in this case, just like wait_task_zombie().

TODO: with this change we can hopefully clean up (or maybe even kill) the
similar SIGNAL_GROUP_EXIT checks, at least in wait_task_zombie().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20250324171941.GA13114@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/exit.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/exit.c b/kernel/exit.c
index 5d1226fdfadc..1b51dc099f1e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -268,6 +268,9 @@ void release_task(struct task_struct *p)
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader)
 			&& leader->exit_state == EXIT_ZOMBIE) {
+		/* for pidfs_exit() and do_notify_parent() */
+		if (leader->signal->flags & SIGNAL_GROUP_EXIT)
+			leader->exit_code = leader->signal->group_exit_code;
 		/*
 		 * If we were the last child thread and the leader has
 		 * exited already, and the leader's parent ignores SIGCHLD,

From e3206c4aa06fb7c7165b5a4f49cb3d5f35ccc0e9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 24 Mar 2025 18:32:26 +0100
Subject: [PATCH 07/10] exportfs: add module description

Every loadable module should have a description, to avoid a warning such as:

WARNING: modpost: missing MODULE_DESCRIPTION() in fs/exportfs/exportfs.o

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20250324173242.1501003-1-arnd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exportfs/expfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index b5845c4846b8..128dd092916b 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -608,4 +608,5 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(exportfs_decode_fh);
 
+MODULE_DESCRIPTION("Code mapping from inodes to file handles");
 MODULE_LICENSE("GPL");

From 8de544883456d1cd86dc971e21e6e764f393c7d0 Mon Sep 17 00:00:00 2001
From: Andreas Hindborg <a.hindborg@kernel.org>
Date: Wed, 26 Mar 2025 17:45:30 +0100
Subject: [PATCH 08/10] MAINTAINERS: configfs: add Andreas Hindborg as
 maintainer

Remove Joel Becker as maintainer of configfs and add Andreas Hindborg as
maintainer and Breno Leitao as reviewer. Also update the tree URL.

Add an entry for Joel Becker to CREDITS.

Acked-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
Link: https://lore.kernel.org/r/20250326-configfs-maintainer-v1-1-b175189fa27b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 CREDITS     | 4 ++++
 MAINTAINERS | 5 +++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/CREDITS b/CREDITS
index 53d11a46fd69..3ec620f7260b 100644
--- a/CREDITS
+++ b/CREDITS
@@ -317,6 +317,10 @@ S: Code 930.5, Goddard Space Flight Center
 S: Greenbelt, Maryland 20771
 S: USA
 
+N: Joel Becker
+E: jlbec@evilplan.org
+D: configfs
+
 N: Adam Belay
 E: ambx1@neo.rr.com
 D: Linux Plug and Play Support
diff --git a/MAINTAINERS b/MAINTAINERS
index e79c8a1c58d8..81215f2ee4d9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5867,9 +5867,10 @@ S:	Maintained
 F:	Documentation/security/snp-tdx-threat-model.rst
 
 CONFIGFS
-M:	Joel Becker <jlbec@evilplan.org>
+M:	Andreas Hindborg <a.hindborg@kernel.org>
+R:	Breno Leitao <leitao@debian.org>
 S:	Supported
-T:	git git://git.infradead.org/users/hch/configfs.git
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/a.hindborg/linux.git configfs-next
 F:	fs/configfs/
 F:	include/linux/configfs.h
 F:	samples/configfs/

From 9e6901f17a719650be376f04d742bdbe1d7094ce Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 26 Mar 2025 18:17:44 -0600
Subject: [PATCH 09/10] fs: namespace: Avoid -Wflex-array-member-not-at-end
 warning

-Wflex-array-member-not-at-end was introduced in GCC-14, and we are
getting ready to enable it, globally.

Move the conflicting declaration to the end of the structure. Notice
that `struct statmount` is a flexible structure -- a structure that
contains a flexible-array member.

Fix the following warning:

fs/namespace.c:5329:26: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]

Signed-off-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/Z-SZKNdCiAkVJvqm@kspp
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 6100e5b962a6..16292ff760c9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5326,8 +5326,10 @@ struct kstatmount {
 	struct mnt_idmap *idmap;
 	u64 mask;
 	struct path root;
-	struct statmount sm;
 	struct seq_file seq;
+
+	/* Must be last --ends in a flexible-array member. */
+	struct statmount sm;
 };
 
 static u64 mnt_to_attr_flags(struct vfsmount *mnt)

From 923936efeb74b3f42e5ad283a0b9110bda102601 Mon Sep 17 00:00:00 2001
From: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Date: Fri, 28 Mar 2025 01:01:19 +0800
Subject: [PATCH 10/10] iomap: Fix conflicting values of iomap flags

IOMAP_F_ATOMIC_BIO mistakenly took the same value as IOMAP_F_SIZE_CHANGED
in commit 370a6de7651b ("iomap: rework IOMAP atomic flags").
Let's fix this and also create some more space for filesystem-reported
flags to avoid this in the future. This patch makes the core iomap flags
start from bit 15, moving downwards. Note that the "flags" member within
struct iomap is of type u16.

Fixes: 370a6de7651b ("iomap: rework IOMAP atomic flags")
Signed-off-by: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Link: https://lore.kernel.org/r/20250327170119.61045-1-ritesh.list@gmail.com
Reviewed-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/iomap.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 02fe001feebb..68416b135151 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -78,6 +78,11 @@ struct vm_fault;
 #define IOMAP_F_ANON_WRITE	(1U << 7)
 #define IOMAP_F_ATOMIC_BIO	(1U << 8)
 
+/*
+ * Flag reserved for file system specific usage
+ */
+#define IOMAP_F_PRIVATE		(1U << 12)
+
 /*
  * Flags set by the core iomap code during operations:
  *
@@ -88,14 +93,8 @@ struct vm_fault;
  * range it covers needs to be remapped by the high level before the operation
  * can proceed.
  */
-#define IOMAP_F_SIZE_CHANGED	(1U << 8)
-#define IOMAP_F_STALE		(1U << 9)
-
-/*
- * Flags from 0x1000 up are for file system specific usage:
- */
-#define IOMAP_F_PRIVATE		(1U << 12)
-
+#define IOMAP_F_SIZE_CHANGED	(1U << 14)
+#define IOMAP_F_STALE		(1U << 15)
 
 /*
  * Magic value for addr: