From 12ae2c81b21cfaa193db2faf035d495807edc3a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:59 +0100 Subject: [PATCH 1/9] clone: add CLONE_AUTOREAP Add a new clone3() flag CLONE_AUTOREAP that makes a child process auto-reap on exit without ever becoming a zombie. This is a per-process property in contrast to the existing auto-reap mechanism via SA_NOCLDWAIT or SIG_IGN for SIGCHLD which applies to all children of a given parent. Currently the only way to automatically reap children is to set SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped property affecting all children which makes it unsuitable for libraries or applications that need selective auto-reaping of specific children while still being able to wait() on others. CLONE_AUTOREAP stores an autoreap flag in the child's signal_struct. When the child exits do_notify_parent() checks this flag and causes exit_notify() to transition the task directly to EXIT_DEAD. Since the flag lives on the child it survives reparenting: if the original parent exits and the child is reparented to a subreaper or init the child still auto-reaps when it eventually exits. CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent to monitor the child's exit via poll() and retrieve exit status via PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget pattern where the parent simply doesn't care about the child's exit status. No exit signal is delivered so exit_signal must be zero. CLONE_AUTOREAP is rejected in combination with CLONE_PARENT. If a CLONE_AUTOREAP child were to clone(CLONE_PARENT) the new grandchild would inherit exit_signal == 0 from the autoreap parent's group leader but without signal->autoreap. This grandchild would become a zombie that never sends a signal and is never autoreaped - confusing and arguably broken behavior. The flag is not inherited by the autoreap process's own children. 
Each child that should be autoreaped must be explicitly created with CLONE_AUTOREAP. Link: https://github.com/uapi-group/kernel-features/issues/45 Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-1-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/linux/sched/signal.h | 1 + include/uapi/linux/sched.h | 5 +++-- kernel/fork.c | 17 ++++++++++++++++- kernel/ptrace.c | 3 ++- kernel/signal.c | 4 ++++ 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a22248aebcf9..f842c86b806f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -132,6 +132,7 @@ struct signal_struct { */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; + unsigned int autoreap:1; #ifdef CONFIG_POSIX_TIMERS diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a4..69f7b4f9eb0c 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -34,8 +34,9 @@ #define CLONE_IO 0x80000000 /* Clone io context */ /* Flags for the clone3() syscall. */ -#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ -#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. 
*/ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 diff --git a/kernel/fork.c b/kernel/fork.c index e832da9d15a4..10549574fda6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2028,6 +2028,18 @@ __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_AUTOREAP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_PARENT) + return ERR_PTR(-EINVAL); + if (args->exit_signal) + return ERR_PTR(-EINVAL); + } + + if ((clone_flags & CLONE_PARENT) && current->signal->autoreap) + return ERR_PTR(-EINVAL); + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -2435,6 +2447,8 @@ __latent_entropy struct task_struct *copy_process( */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; + if (clone_flags & CLONE_AUTOREAP) + p->signal->autoreap = 1; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); @@ -2897,7 +2911,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. 
*/ if (kargs->flags & - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | + CLONE_AUTOREAP)) return false; /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 392ec2f75f01..68c17daef8d4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -549,7 +549,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!dead && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) dead = do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { + else if (ignoring_children(tracer->sighand) || + p->signal->autoreap) { __wake_up_parent(p, tracer); dead = true; } diff --git a/kernel/signal.c b/kernel/signal.c index d65d0fe24bfb..e61f39fa8c8a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2251,6 +2251,10 @@ bool do_notify_parent(struct task_struct *tsk, int sig) if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } + if (!tsk->ptrace && tsk->signal->autoreap) { + autoreap = true; + sig = 0; + } /* * Send with __send_signal as si_pid and si_uid are in the * parent's namespaces. From 24baca56fafc33d4fb77cd9858a48c734183cb22 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:51:00 +0100 Subject: [PATCH 2/9] clone: add CLONE_NNP Add a new clone3() flag CLONE_NNP that sets no_new_privs on the child process at clone time. This is analogous to prctl(PR_SET_NO_NEW_PRIVS) but applied at process creation rather than requiring a separate step after the child starts running. CLONE_NNP is rejected in combination with CLONE_THREAD. It is conceptually much simpler to force the whole thread-group into NNP than to have individual threads running around with NNP. 
Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-2-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 1 + kernel/fork.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 69f7b4f9eb0c..386c8d7e89cb 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -37,6 +37,7 @@ #define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */ #define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ #define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ +#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 diff --git a/kernel/fork.c b/kernel/fork.c index 10549574fda6..736798e4005a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2040,6 +2040,11 @@ __latent_entropy struct task_struct *copy_process( if ((clone_flags & CLONE_PARENT) && current->signal->autoreap) return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_NNP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -2424,6 +2429,9 @@ __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + if (clone_flags & CLONE_NNP) + task_set_no_new_privs(p); + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2912,7 +2920,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) /* Verify that no unknown flags are passed along. 
*/ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | - CLONE_AUTOREAP)) + CLONE_AUTOREAP | CLONE_NNP)) return false; /* From c8134b5f13ae959de2b3c8cc278e2602b0857345 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:51:01 +0100 Subject: [PATCH 3/9] pidfd: add CLONE_PIDFD_AUTOKILL Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's lifetime to the pidfd returned from clone3(). When the last reference to the struct file created by clone3() is closed the kernel sends SIGKILL to the child. A pidfd obtained via pidfd_open() for the same process does not keep the child alive and does not trigger autokill - only the specific struct file from clone3() has this property. This is useful for container runtimes, service managers, and sandboxed subprocess execution - any scenario where the child must die if the parent crashes or abandons the pidfd. CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no one to reap it would become a zombie). CLONE_THREAD is rejected because autokill targets a process not a thread. The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on the struct file at clone3() time. The pidfs .release handler checks this flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...) only when it is set. Files from pidfd_open() or open_by_handle_at() are distinct struct files that do not carry this flag. dup()/fork() share the same struct file so they extend the child's lifetime until the last reference drops. CLONE_PIDFD_AUTOKILL uses a privilege model based on CLONE_NNP: without CLONE_NNP the child could escalate privileges via setuid/setgid exec after being spawned, so the caller must have CAP_SYS_ADMIN in its user namespace. With CLONE_NNP the child can never gain new privileges so unprivileged usage is allowed. 
This is a deliberate departure from the pdeath_signal model which is reset during secureexec and commit_creds() rendering it useless for container runtimes that need to deprivilege themselves. Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-3-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- fs/pidfs.c | 38 ++++++++++++++++++++++++++++++++------ include/uapi/linux/pidfd.h | 1 + include/uapi/linux/sched.h | 1 + kernel/fork.c | 29 ++++++++++++++++++++++++++--- 4 files changed, 60 insertions(+), 9 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 318253344b5c..a8d1bca0395d 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -637,7 +639,28 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return open_namespace(ns_common); } +static int pidfs_file_release(struct inode *inode, struct file *file) +{ + struct pid *pid = inode->i_private; + struct task_struct *task; + + if (!(file->f_flags & PIDFD_AUTOKILL)) + return 0; + + guard(rcu)(); + task = pid_task(pid, PIDTYPE_TGID); + if (!task) + return 0; + + /* Not available for kthreads or user workers for now. */ + if (WARN_ON_ONCE(task->flags & (PF_KTHREAD | PF_USER_WORKER))) + return 0; + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID); + return 0; +} + static const struct file_operations pidfs_file_operations = { + .release = pidfs_file_release, .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, @@ -1093,11 +1116,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) int ret; /* - * Ensure that PIDFD_STALE can be passed as a flag without - * overloading other uapi pidfd flags. + * Ensure that internal pidfd flags don't overlap with each + * other or with uapi pidfd flags. 
*/ - BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD); - BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK); + BUILD_BUG_ON(hweight32(PIDFD_THREAD | PIDFD_NONBLOCK | + PIDFD_STALE | PIDFD_AUTOKILL) != 4); ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) @@ -1108,9 +1131,12 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) flags &= ~PIDFD_STALE; flags |= O_RDWR; pidfd_file = dentry_open(&path, flags, current_cred()); - /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */ + /* + * Raise PIDFD_THREAD and PIDFD_AUTOKILL explicitly as + * do_dentry_open() strips O_EXCL and O_TRUNC. + */ if (!IS_ERR(pidfd_file)) - pidfd_file->f_flags |= (flags & PIDFD_THREAD); + pidfd_file->f_flags |= (flags & (PIDFD_THREAD | PIDFD_AUTOKILL)); return pidfd_file; } diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index ea9a6811fc76..9281956a9f32 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -13,6 +13,7 @@ #ifdef __KERNEL__ #include #define PIDFD_STALE CLONE_PIDFD +#define PIDFD_AUTOKILL O_TRUNC #endif /* Flags for pidfd_send_signal(). */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 386c8d7e89cb..149dbc64923b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -38,6 +38,7 @@ #define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ #define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ #define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */ +#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. 
*/ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 diff --git a/kernel/fork.c b/kernel/fork.c index 736798e4005a..99a6cb4e7ab0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2045,6 +2045,24 @@ __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD_AUTOKILL) { + if (!(clone_flags & CLONE_PIDFD)) + return ERR_PTR(-EINVAL); + if (!(clone_flags & CLONE_AUTOREAP)) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + /* + * Without CLONE_NNP the child could escalate privileges + * after being spawned, so require CAP_SYS_ADMIN. + * With CLONE_NNP the child can't gain new privileges, + * so allow unprivileged usage. + */ + if (!(clone_flags & CLONE_NNP) && + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -2267,13 +2285,18 @@ __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + unsigned flags = PIDFD_STALE; + + if (clone_flags & CLONE_THREAD) + flags |= PIDFD_THREAD; + if (clone_flags & CLONE_PIDFD_AUTOKILL) + flags |= PIDFD_AUTOKILL; /* * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ - retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); + retval = pidfd_prepare(pid, flags, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2920,7 +2943,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) /* Verify that no unknown flags are passed along. 
*/ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | - CLONE_AUTOREAP | CLONE_NNP)) + CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL)) return false; /* From 76d46ad2c52a4d7631274a35777ac4601103e2aa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:51:02 +0100 Subject: [PATCH 4/9] selftests/pidfd: add CLONE_AUTOREAP tests Add tests for the new CLONE_AUTOREAP clone3() flag: - autoreap_without_pidfd: CLONE_AUTOREAP without CLONE_PIDFD works (fire-and-forget) - autoreap_rejects_exit_signal: CLONE_AUTOREAP with non-zero exit_signal fails - autoreap_rejects_parent: CLONE_AUTOREAP with CLONE_PARENT fails - autoreap_rejects_thread: CLONE_AUTOREAP with CLONE_THREAD fails - autoreap_basic: child exits, pidfd poll works, PIDFD_GET_INFO returns correct exit code, waitpid() fails with ECHILD - autoreap_signaled: child killed by signal, exit info correct via pidfd - autoreap_reparent: autoreap grandchild reparented to subreaper still auto-reaps - autoreap_multithreaded: autoreap process with sub-threads auto-reaps after last thread exits - autoreap_no_inherit: grandchild forked without CLONE_AUTOREAP becomes a regular zombie Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-4-d148b984a989@kernel.org Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/.gitignore | 1 + tools/testing/selftests/pidfd/Makefile | 2 +- .../selftests/pidfd/pidfd_autoreap_test.c | 496 ++++++++++++++++++ 3 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/pidfd/pidfd_autoreap_test.c diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 144e7ff65d6a..4cd8ec7fd349 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -12,3 +12,4 @@ pidfd_info_test pidfd_exec_helper pidfd_xattr_test pidfd_setattr_test +pidfd_autoreap_test diff --git a/tools/testing/selftests/pidfd/Makefile 
b/tools/testing/selftests/pidfd/Makefile index 764a8f9ecefa..4211f91e9af8 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -4,7 +4,7 @@ CFLAGS += -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ pidfd_file_handle_test pidfd_bind_mount pidfd_info_test \ - pidfd_xattr_test pidfd_setattr_test + pidfd_xattr_test pidfd_setattr_test pidfd_autoreap_test TEST_GEN_PROGS_EXTENDED := pidfd_exec_helper diff --git a/tools/testing/selftests/pidfd/pidfd_autoreap_test.c b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c new file mode 100644 index 000000000000..22bdc04c7dd0 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c @@ -0,0 +1,496 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2026 Christian Brauner + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "kselftest_harness.h" + +#ifndef CLONE_AUTOREAP +#define CLONE_AUTOREAP (1ULL << 34) +#endif + +static pid_t create_autoreap_child(int *pidfd) +{ + struct __clone_args args = { + .flags = CLONE_PIDFD | CLONE_AUTOREAP, + .exit_signal = 0, + .pidfd = ptr_to_u64(pidfd), + }; + + return sys_clone3(&args, sizeof(args)); +} + +/* + * Test that CLONE_AUTOREAP works without CLONE_PIDFD (fire-and-forget). + */ +TEST(autoreap_without_pidfd) +{ + struct __clone_args args = { + .flags = CLONE_AUTOREAP, + .exit_signal = 0, + }; + pid_t pid; + int ret; + + pid = sys_clone3(&args, sizeof(args)); + if (pid < 0 && errno == EINVAL) + SKIP(return, "CLONE_AUTOREAP not supported"); + ASSERT_GE(pid, 0); + + if (pid == 0) + _exit(0); + + /* + * Give the child a moment to exit and be autoreaped. + * Then verify no zombie remains. 
+ */ + usleep(200000); + ret = waitpid(pid, NULL, WNOHANG); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ECHILD); +} + +/* + * Test that CLONE_AUTOREAP with a non-zero exit_signal fails. + */ +TEST(autoreap_rejects_exit_signal) +{ + struct __clone_args args = { + .flags = CLONE_AUTOREAP, + .exit_signal = SIGCHLD, + }; + pid_t pid; + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_EQ(pid, -1); + ASSERT_EQ(errno, EINVAL); +} + +/* + * Test that CLONE_AUTOREAP with CLONE_PARENT fails. + */ +TEST(autoreap_rejects_parent) +{ + struct __clone_args args = { + .flags = CLONE_AUTOREAP | CLONE_PARENT, + .exit_signal = 0, + }; + pid_t pid; + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_EQ(pid, -1); + ASSERT_EQ(errno, EINVAL); +} + +/* + * Test that CLONE_AUTOREAP with CLONE_THREAD fails. + */ +TEST(autoreap_rejects_thread) +{ + struct __clone_args args = { + .flags = CLONE_AUTOREAP | CLONE_THREAD | + CLONE_SIGHAND | CLONE_VM, + .exit_signal = 0, + }; + pid_t pid; + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_EQ(pid, -1); + ASSERT_EQ(errno, EINVAL); +} + +/* + * Basic test: create an autoreap child, let it exit, verify: + * - pidfd becomes readable (poll returns POLLIN) + * - PIDFD_GET_INFO returns the correct exit code + * - waitpid() returns -1/ECHILD (no zombie) + */ +TEST(autoreap_basic) +{ + struct pidfd_info info = { .mask = PIDFD_INFO_EXIT }; + int pidfd = -1, ret; + struct pollfd pfd; + pid_t pid; + + pid = create_autoreap_child(&pidfd); + if (pid < 0 && errno == EINVAL) + SKIP(return, "CLONE_AUTOREAP not supported"); + ASSERT_GE(pid, 0); + + if (pid == 0) + _exit(42); + + ASSERT_GE(pidfd, 0); + + /* Wait for the child to exit via pidfd poll. */ + pfd.fd = pidfd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + ASSERT_TRUE(pfd.revents & POLLIN); + + /* Verify exit info via PIDFD_GET_INFO. 
*/ + ret = ioctl(pidfd, PIDFD_GET_INFO, &info); + ASSERT_EQ(ret, 0); + ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT); + /* + * exit_code is in waitpid format: for _exit(42), + * WIFEXITED is true and WEXITSTATUS is 42. + */ + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 42); + + /* Verify no zombie: waitpid should fail with ECHILD. */ + ret = waitpid(pid, NULL, WNOHANG); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ECHILD); + + close(pidfd); +} + +/* + * Test that an autoreap child killed by a signal reports + * the correct exit info. + */ +TEST(autoreap_signaled) +{ + struct pidfd_info info = { .mask = PIDFD_INFO_EXIT }; + int pidfd = -1, ret; + struct pollfd pfd; + pid_t pid; + + pid = create_autoreap_child(&pidfd); + if (pid < 0 && errno == EINVAL) + SKIP(return, "CLONE_AUTOREAP not supported"); + ASSERT_GE(pid, 0); + + if (pid == 0) { + pause(); + _exit(1); + } + + ASSERT_GE(pidfd, 0); + + /* Kill the child. */ + ret = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + ASSERT_EQ(ret, 0); + + /* Wait for exit via pidfd. */ + pfd.fd = pidfd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + ASSERT_TRUE(pfd.revents & POLLIN); + + /* Verify signal info. */ + ret = ioctl(pidfd, PIDFD_GET_INFO, &info); + ASSERT_EQ(ret, 0); + ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT); + ASSERT_TRUE(WIFSIGNALED(info.exit_code)); + ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL); + + /* No zombie. */ + ret = waitpid(pid, NULL, WNOHANG); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ECHILD); + + close(pidfd); +} + +/* + * Test autoreap survives reparenting: middle process creates an + * autoreap grandchild, then exits. The grandchild gets reparented + * to us (the grandparent, which is a subreaper). When the grandchild + * exits, it should still be autoreaped - no zombie under us. 
+ */ +TEST(autoreap_reparent) +{ + int ipc_sockets[2], ret; + int pidfd = -1; + struct pollfd pfd; + pid_t mid_pid, grandchild_pid; + char buf[32] = {}; + + /* Make ourselves a subreaper so reparented children come to us. */ + ret = prctl(PR_SET_CHILD_SUBREAPER, 1); + ASSERT_EQ(ret, 0); + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + mid_pid = fork(); + ASSERT_GE(mid_pid, 0); + + if (mid_pid == 0) { + /* Middle child: create an autoreap grandchild. */ + int gc_pidfd = -1; + + close(ipc_sockets[0]); + + grandchild_pid = create_autoreap_child(&gc_pidfd); + if (grandchild_pid < 0) { + write_nointr(ipc_sockets[1], "E", 1); + close(ipc_sockets[1]); + _exit(1); + } + + if (grandchild_pid == 0) { + /* Grandchild: wait for signal to exit. */ + close(ipc_sockets[1]); + if (gc_pidfd >= 0) + close(gc_pidfd); + pause(); + _exit(0); + } + + /* Send grandchild PID to grandparent. */ + snprintf(buf, sizeof(buf), "%d", grandchild_pid); + write_nointr(ipc_sockets[1], buf, strlen(buf)); + close(ipc_sockets[1]); + if (gc_pidfd >= 0) + close(gc_pidfd); + + /* Middle child exits, grandchild gets reparented. */ + _exit(0); + } + + close(ipc_sockets[1]); + + /* Read grandchild's PID. */ + ret = read_nointr(ipc_sockets[0], buf, sizeof(buf) - 1); + close(ipc_sockets[0]); + ASSERT_GT(ret, 0); + + if (buf[0] == 'E') { + waitpid(mid_pid, NULL, 0); + prctl(PR_SET_CHILD_SUBREAPER, 0); + SKIP(return, "CLONE_AUTOREAP not supported"); + } + + grandchild_pid = atoi(buf); + ASSERT_GT(grandchild_pid, 0); + + /* Wait for the middle child to exit. */ + ret = waitpid(mid_pid, NULL, 0); + ASSERT_EQ(ret, mid_pid); + + /* + * Now the grandchild is reparented to us (subreaper). + * Open a pidfd for the grandchild and kill it. + */ + pidfd = sys_pidfd_open(grandchild_pid, 0); + ASSERT_GE(pidfd, 0); + + ret = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + ASSERT_EQ(ret, 0); + + /* Wait for it to exit via pidfd poll. 
*/ + pfd.fd = pidfd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + ASSERT_TRUE(pfd.revents & POLLIN); + + /* + * The grandchild should have been autoreaped even though + * we (the new parent) haven't set SA_NOCLDWAIT. + * waitpid should return -1/ECHILD. + */ + ret = waitpid(grandchild_pid, NULL, WNOHANG); + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno, ECHILD); + + close(pidfd); + + /* Clean up subreaper status. */ + prctl(PR_SET_CHILD_SUBREAPER, 0); +} + +static int thread_sock_fd; + +static void *thread_func(void *arg) +{ + /* Signal parent we're running. */ + write_nointr(thread_sock_fd, "1", 1); + + /* Give main thread time to call _exit() first. */ + usleep(200000); + + return NULL; +} + +/* + * Test that an autoreap child with multiple threads is properly + * autoreaped only after all threads have exited. + */ +TEST(autoreap_multithreaded) +{ + struct pidfd_info info = { .mask = PIDFD_INFO_EXIT }; + int ipc_sockets[2], ret; + int pidfd = -1; + struct pollfd pfd; + pid_t pid; + char c; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid = create_autoreap_child(&pidfd); + if (pid < 0 && errno == EINVAL) { + close(ipc_sockets[0]); + close(ipc_sockets[1]); + SKIP(return, "CLONE_AUTOREAP not supported"); + } + ASSERT_GE(pid, 0); + + if (pid == 0) { + pthread_t thread; + + close(ipc_sockets[0]); + + /* + * Create a sub-thread that outlives the main thread. + * The thread signals readiness, then sleeps. + * The main thread waits briefly, then calls _exit(). + */ + thread_sock_fd = ipc_sockets[1]; + pthread_create(&thread, NULL, thread_func, NULL); + pthread_detach(thread); + + /* Wait for thread to be running. */ + usleep(100000); + + /* Main thread exits; sub-thread is still alive. */ + _exit(99); + } + + close(ipc_sockets[1]); + + /* Wait for the sub-thread to signal readiness. 
*/ + ret = read_nointr(ipc_sockets[0], &c, 1); + close(ipc_sockets[0]); + ASSERT_EQ(ret, 1); + + /* Wait for the process to fully exit via pidfd poll. */ + pfd.fd = pidfd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + ASSERT_TRUE(pfd.revents & POLLIN); + + /* Verify exit info. */ + ret = ioctl(pidfd, PIDFD_GET_INFO, &info); + ASSERT_EQ(ret, 0); + ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT); + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 99); + + /* No zombie. */ + ret = waitpid(pid, NULL, WNOHANG); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ECHILD); + + close(pidfd); +} + +/* + * Test that autoreap is NOT inherited by grandchildren. + */ +TEST(autoreap_no_inherit) +{ + int ipc_sockets[2], ret; + int pidfd = -1; + pid_t pid; + char buf[2] = {}; + struct pollfd pfd; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid = create_autoreap_child(&pidfd); + if (pid < 0 && errno == EINVAL) { + close(ipc_sockets[0]); + close(ipc_sockets[1]); + SKIP(return, "CLONE_AUTOREAP not supported"); + } + ASSERT_GE(pid, 0); + + if (pid == 0) { + pid_t gc; + int status; + + close(ipc_sockets[0]); + + /* Autoreap child forks a grandchild (without autoreap). */ + gc = fork(); + if (gc < 0) { + write_nointr(ipc_sockets[1], "E", 1); + _exit(1); + } + if (gc == 0) { + /* Grandchild: exit immediately. */ + close(ipc_sockets[1]); + _exit(77); + } + + /* + * The grandchild should become a regular zombie + * since it was NOT created with CLONE_AUTOREAP. + * Wait for it to verify. 
+ */ + ret = waitpid(gc, &status, 0); + if (ret == gc && WIFEXITED(status) && + WEXITSTATUS(status) == 77) { + write_nointr(ipc_sockets[1], "P", 1); + } else { + write_nointr(ipc_sockets[1], "F", 1); + } + close(ipc_sockets[1]); + _exit(0); + } + + close(ipc_sockets[1]); + + ret = read_nointr(ipc_sockets[0], buf, 1); + close(ipc_sockets[0]); + ASSERT_EQ(ret, 1); + + /* + * 'P' means the autoreap child was able to waitpid() its + * grandchild (correct - grandchild should be a normal zombie, + * not autoreaped). + */ + ASSERT_EQ(buf[0], 'P'); + + /* Wait for the autoreap child to exit. */ + pfd.fd = pidfd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + + /* Autoreap child itself should be autoreaped. */ + ret = waitpid(pid, NULL, WNOHANG); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ECHILD); + + close(pidfd); +} + +TEST_HARNESS_MAIN From 2a4d85aa1c0a894d962a15dc75e8489f1e91f5f6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:51:03 +0100 Subject: [PATCH 5/9] selftests/pidfd: add CLONE_NNP tests Add tests for the new CLONE_NNP flag: - nnp_sets_no_new_privs: Verify a child created with CLONE_NNP has no_new_privs set while the parent does not. - nnp_rejects_thread: Verify CLONE_NNP | CLONE_THREAD is rejected with -EINVAL since no_new_privs is meant to apply to the whole thread-group rather than to a single thread. - autoreap_no_new_privs_unset: Verify a plain CLONE_AUTOREAP child does not get no_new_privs. 
Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-5-d148b984a989@kernel.org Signed-off-by: Christian Brauner --- .../selftests/pidfd/pidfd_autoreap_test.c | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/tools/testing/selftests/pidfd/pidfd_autoreap_test.c b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c index 22bdc04c7dd0..d1f3882f7d6e 100644 --- a/tools/testing/selftests/pidfd/pidfd_autoreap_test.c +++ b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c @@ -26,6 +26,10 @@ #define CLONE_AUTOREAP (1ULL << 34) #endif +#ifndef CLONE_NNP +#define CLONE_NNP (1ULL << 35) +#endif + static pid_t create_autoreap_child(int *pidfd) { struct __clone_args args = { @@ -493,4 +497,126 @@ TEST(autoreap_no_inherit) close(pidfd); } +/* + * Test that CLONE_NNP sets no_new_privs on the child. + * The child checks via prctl(PR_GET_NO_NEW_PRIVS) and reports back. + * The parent must NOT have no_new_privs set afterwards. + */ +TEST(nnp_sets_no_new_privs) +{ + struct __clone_args args = { + .flags = CLONE_PIDFD | CLONE_AUTOREAP | CLONE_NNP, + .exit_signal = 0, + }; + struct pidfd_info info = { .mask = PIDFD_INFO_EXIT }; + int pidfd = -1, ret; + struct pollfd pfd; + pid_t pid; + + /* Ensure parent does not already have no_new_privs. */ + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + ASSERT_EQ(ret, 0) { + TH_LOG("Parent already has no_new_privs set, cannot run test"); + } + + args.pidfd = ptr_to_u64(&pidfd); + + pid = sys_clone3(&args, sizeof(args)); + if (pid < 0 && errno == EINVAL) + SKIP(return, "CLONE_NNP not supported"); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* + * Child: check no_new_privs. Exit 0 if set, 1 if not. + */ + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + _exit(ret == 1 ? 0 : 1); + } + + ASSERT_GE(pidfd, 0); + + /* Parent must still NOT have no_new_privs. 
*/ + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + ASSERT_EQ(ret, 0) { + TH_LOG("Parent got no_new_privs after creating CLONE_NNP child"); + } + + /* Wait for child to exit. */ + pfd.fd = pidfd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + + /* Verify child exited with 0 (no_new_privs was set). */ + ret = ioctl(pidfd, PIDFD_GET_INFO, &info); + ASSERT_EQ(ret, 0); + ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT); + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0) { + TH_LOG("Child did not have no_new_privs set"); + } + + close(pidfd); +} + +/* + * Test that CLONE_NNP with CLONE_THREAD fails with EINVAL. + */ +TEST(nnp_rejects_thread) +{ + struct __clone_args args = { + .flags = CLONE_NNP | CLONE_THREAD | + CLONE_SIGHAND | CLONE_VM, + .exit_signal = 0, + }; + pid_t pid; + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_EQ(pid, -1); + ASSERT_EQ(errno, EINVAL); +} + +/* + * Test that a plain CLONE_AUTOREAP child does NOT get no_new_privs. + * Only CLONE_NNP should set it. + */ +TEST(autoreap_no_new_privs_unset) +{ + struct pidfd_info info = { .mask = PIDFD_INFO_EXIT }; + int pidfd = -1, ret; + struct pollfd pfd; + pid_t pid; + + pid = create_autoreap_child(&pidfd); + if (pid < 0 && errno == EINVAL) + SKIP(return, "CLONE_AUTOREAP not supported"); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* + * Child: check no_new_privs. Exit 0 if NOT set, 1 if set. + */ + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + _exit(ret == 0 ? 
0 : 1); + } + + ASSERT_GE(pidfd, 0); + + pfd.fd = pidfd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + + ret = ioctl(pidfd, PIDFD_GET_INFO, &info); + ASSERT_EQ(ret, 0); + ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT); + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0) { + TH_LOG("Plain autoreap child unexpectedly has no_new_privs"); + } + + close(pidfd); +} + TEST_HARNESS_MAIN From ec26879e6d89983b31fdb27d149854f42ee8d689 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:51:04 +0100 Subject: [PATCH 6/9] selftests/pidfd: add CLONE_PIDFD_AUTOKILL tests Add tests for CLONE_PIDFD_AUTOKILL: - autokill_basic: Verify closing the clone3 pidfd kills the child. - autokill_requires_pidfd: Verify AUTOKILL without CLONE_PIDFD fails. - autokill_requires_autoreap: Verify AUTOKILL without CLONE_AUTOREAP fails. - autokill_rejects_thread: Verify AUTOKILL with CLONE_THREAD fails. - autokill_pidfd_open_no_effect: Verify only the clone3 pidfd triggers autokill, not pidfd_open(). - autokill_requires_cap_sys_admin: Verify AUTOKILL without CLONE_NNP fails with -EPERM for an unprivileged caller. - autokill_without_nnp_with_cap: Verify AUTOKILL without CLONE_NNP succeeds with CAP_SYS_ADMIN. 
Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-6-d148b984a989@kernel.org Signed-off-by: Christian Brauner --- .../selftests/pidfd/pidfd_autoreap_test.c | 278 ++++++++++++++++++ 1 file changed, 278 insertions(+) diff --git a/tools/testing/selftests/pidfd/pidfd_autoreap_test.c b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c index d1f3882f7d6e..1c586482f25b 100644 --- a/tools/testing/selftests/pidfd/pidfd_autoreap_test.c +++ b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c @@ -30,6 +30,33 @@ #define CLONE_NNP (1ULL << 35) #endif +#ifndef CLONE_PIDFD_AUTOKILL +#define CLONE_PIDFD_AUTOKILL (1ULL << 36) +#endif + +#ifndef _LINUX_CAPABILITY_VERSION_3 +#define _LINUX_CAPABILITY_VERSION_3 0x20080522 +#endif + +struct cap_header { + __u32 version; + int pid; +}; + +struct cap_data { + __u32 effective; + __u32 permitted; + __u32 inheritable; +}; + +static int drop_all_caps(void) +{ + struct cap_header hdr = { .version = _LINUX_CAPABILITY_VERSION_3 }; + struct cap_data data[2] = {}; + + return syscall(__NR_capset, &hdr, data); +} + static pid_t create_autoreap_child(int *pidfd) { struct __clone_args args = { @@ -619,4 +646,255 @@ TEST(autoreap_no_new_privs_unset) close(pidfd); } +/* + * Helper: create a child with CLONE_PIDFD | CLONE_PIDFD_AUTOKILL | CLONE_AUTOREAP | CLONE_NNP. + */ +static pid_t create_autokill_child(int *pidfd) +{ + struct __clone_args args = { + .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL | + CLONE_AUTOREAP | CLONE_NNP, + .exit_signal = 0, + .pidfd = ptr_to_u64(pidfd), + }; + + return sys_clone3(&args, sizeof(args)); +} + +/* + * Basic autokill test: child blocks in pause(), parent closes the + * clone3 pidfd, child should be killed and autoreaped. 
+ */ +TEST(autokill_basic) +{ + int pidfd = -1, pollfd_fd = -1, ret; + struct pollfd pfd; + pid_t pid; + + pid = create_autokill_child(&pidfd); + if (pid < 0 && errno == EINVAL) + SKIP(return, "CLONE_PIDFD_AUTOKILL not supported"); + ASSERT_GE(pid, 0); + + if (pid == 0) { + pause(); + _exit(1); + } + + ASSERT_GE(pidfd, 0); + + /* + * Open a second pidfd via pidfd_open() so we can observe the + * child's death after closing the clone3 pidfd. + */ + pollfd_fd = sys_pidfd_open(pid, 0); + ASSERT_GE(pollfd_fd, 0); + + /* Close the clone3 pidfd — this should trigger autokill. */ + close(pidfd); + + /* Wait for the child to die via the pidfd_open'd fd. */ + pfd.fd = pollfd_fd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + ASSERT_TRUE(pfd.revents & POLLIN); + + /* Child should be autoreaped — no zombie. */ + usleep(100000); + ret = waitpid(pid, NULL, WNOHANG); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ECHILD); + + close(pollfd_fd); +} + +/* + * CLONE_PIDFD_AUTOKILL without CLONE_PIDFD must fail with EINVAL. + */ +TEST(autokill_requires_pidfd) +{ + struct __clone_args args = { + .flags = CLONE_PIDFD_AUTOKILL | CLONE_AUTOREAP, + .exit_signal = 0, + }; + pid_t pid; + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_EQ(pid, -1); + ASSERT_EQ(errno, EINVAL); +} + +/* + * CLONE_PIDFD_AUTOKILL without CLONE_AUTOREAP must fail with EINVAL. + */ +TEST(autokill_requires_autoreap) +{ + int pidfd = -1; + struct __clone_args args = { + .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL, + .exit_signal = 0, + .pidfd = ptr_to_u64(&pidfd), + }; + pid_t pid; + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_EQ(pid, -1); + ASSERT_EQ(errno, EINVAL); +} + +/* + * CLONE_PIDFD_AUTOKILL with CLONE_THREAD must fail with EINVAL. 
+ */ +TEST(autokill_rejects_thread) +{ + int pidfd = -1; + struct __clone_args args = { + .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL | + CLONE_AUTOREAP | CLONE_THREAD | + CLONE_SIGHAND | CLONE_VM, + .exit_signal = 0, + .pidfd = ptr_to_u64(&pidfd), + }; + pid_t pid; + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_EQ(pid, -1); + ASSERT_EQ(errno, EINVAL); +} + +/* + * Test that only the clone3 pidfd triggers autokill, not pidfd_open(). + * Close the pidfd_open'd fd first — child should survive. + * Then close the clone3 pidfd — child should be killed and autoreaped. + */ +TEST(autokill_pidfd_open_no_effect) +{ + int pidfd = -1, open_fd = -1, ret; + struct pollfd pfd; + pid_t pid; + + pid = create_autokill_child(&pidfd); + if (pid < 0 && errno == EINVAL) + SKIP(return, "CLONE_PIDFD_AUTOKILL not supported"); + ASSERT_GE(pid, 0); + + if (pid == 0) { + pause(); + _exit(1); + } + + ASSERT_GE(pidfd, 0); + + /* Open a second pidfd via pidfd_open(). */ + open_fd = sys_pidfd_open(pid, 0); + ASSERT_GE(open_fd, 0); + + /* + * Close the pidfd_open'd fd — child should survive because + * only the clone3 pidfd has autokill. + */ + close(open_fd); + usleep(200000); + + /* Verify child is still alive by polling the clone3 pidfd. */ + pfd.fd = pidfd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 0); + ASSERT_EQ(ret, 0) { + TH_LOG("Child died after closing pidfd_open fd — should still be alive"); + } + + /* Open another observation fd before triggering autokill. */ + open_fd = sys_pidfd_open(pid, 0); + ASSERT_GE(open_fd, 0); + + /* Now close the clone3 pidfd — this triggers autokill. */ + close(pidfd); + + pfd.fd = open_fd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + ASSERT_TRUE(pfd.revents & POLLIN); + + /* Child should be autoreaped — no zombie. 
*/ + usleep(100000); + ret = waitpid(pid, NULL, WNOHANG); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ECHILD); + + close(open_fd); +} + +/* + * Test that CLONE_PIDFD_AUTOKILL without CLONE_NNP fails with EPERM + * for an unprivileged caller. + */ +TEST(autokill_requires_cap_sys_admin) +{ + int pidfd = -1, ret; + struct __clone_args args = { + .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL | + CLONE_AUTOREAP, + .exit_signal = 0, + .pidfd = ptr_to_u64(&pidfd), + }; + pid_t pid; + + /* Drop all capabilities so we lack CAP_SYS_ADMIN. */ + ret = drop_all_caps(); + ASSERT_EQ(ret, 0); + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_EQ(pid, -1); + ASSERT_EQ(errno, EPERM); +} + +/* + * Test that CLONE_PIDFD_AUTOKILL without CLONE_NNP succeeds with + * CAP_SYS_ADMIN. + */ +TEST(autokill_without_nnp_with_cap) +{ + struct __clone_args args = { + .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL | + CLONE_AUTOREAP, + .exit_signal = 0, + }; + struct pidfd_info info = { .mask = PIDFD_INFO_EXIT }; + int pidfd = -1, ret; + struct pollfd pfd; + pid_t pid; + + if (geteuid() != 0) + SKIP(return, "Need root/CAP_SYS_ADMIN"); + + args.pidfd = ptr_to_u64(&pidfd); + + pid = sys_clone3(&args, sizeof(args)); + if (pid < 0 && errno == EINVAL) + SKIP(return, "CLONE_PIDFD_AUTOKILL not supported"); + ASSERT_GE(pid, 0); + + if (pid == 0) + _exit(0); + + ASSERT_GE(pidfd, 0); + + /* Wait for child to exit. 
*/ + pfd.fd = pidfd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 5000); + ASSERT_EQ(ret, 1); + + ret = ioctl(pidfd, PIDFD_GET_INFO, &info); + ASSERT_EQ(ret, 0); + ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT); + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); + + close(pidfd); +} + TEST_HARNESS_MAIN From 3fc66a103395b4ae8d032dcda5621423d94902f6 Mon Sep 17 00:00:00 2001 From: Emanuele Rocca Date: Fri, 20 Mar 2026 20:46:43 +0100 Subject: [PATCH 7/9] kselftest/coredump: reintroduce null pointer dereference Commit 673a55cc49da replaced the null pointer dereference used in crashing_child() with __builtin_trap to address the following LLVM warnings: coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference] coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile' All coredump tests expect crashing_child() to result in a SIGSEGV. However, the behavior of __builtin_trap is architecture-dependent. On x86 it yields SIGILL, on aarch64 SIGTRAP. Given that neither of those signals is SIGSEGV, both coredump_socket_test and coredump_socket_protocol_test are currently failing: get_pidfd_info: mask=0xd7, coredump_mask=0x5, coredump_signal=5 socket_coredump_signal_sigsegv: coredump_signal=5, expected SIGSEGV=11 Qualify the pointer with volatile instead of calling __builtin_trap to fix the tests.
Signed-off-by: Emanuele Rocca Link: https://patch.msgid.link/ab2kI0PI_Vk6bU88@NH27D9T0LF Reviewed-by: Mark Brown Signed-off-by: Christian Brauner --- tools/testing/selftests/coredump/coredump_test_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c index 5c8adee63641..2c850e0b1b57 100644 --- a/tools/testing/selftests/coredump/coredump_test_helpers.c +++ b/tools/testing/selftests/coredump/coredump_test_helpers.c @@ -56,7 +56,7 @@ void crashing_child(void) pthread_create(&thread, NULL, do_nothing, NULL); /* crash on purpose */ - __builtin_trap(); + i = *(volatile int *)NULL; } int create_detached_tmpfs(void) From 701f7f4fbabbf4989ba6fbf033b160dd943221d5 Mon Sep 17 00:00:00 2001 From: Emanuele Rocca Date: Mon, 23 Mar 2026 14:02:16 +0100 Subject: [PATCH 8/9] pidfds: add coredump_code field to pidfd_info The struct pidfd_info currently exposes in a field called coredump_signal the signal number (si_signo) that triggered the dump (for example, 11 for SIGSEGV). However, it is also valuable to understand the reason why that signal was sent. This additional context is provided by the signal code (si_code), such as 2 for SEGV_ACCERR. Add a new field to struct pidfd_info called coredump_code with the value of si_code for the benefit of sysadmins who pipe core dumps to user-space programs for later analysis. 
The following snippet illustrates a simplified C program that consumes coredump_signal and coredump_code, and then logs core dump signals and codes to a file: int pidfd = (int)atoi(argv[1]); struct pidfd_info info = { .mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP, }; if (ioctl(pidfd, PIDFD_GET_INFO, &info) == 0) if (info.mask & PIDFD_INFO_COREDUMP) fprintf(f, "PID=%d, si_signo: %d si_code: %d\n", info.pid, info.coredump_signal, info.coredump_code); Assuming the program is installed under /usr/local/bin/core-logger, core dump processing can be enabled by setting /proc/sys/kernel/core_pattern to '|/usr/local/bin/core-logger %F'. systemd-coredump(8) already uses pidfds to process core dumps, and it could be extended to include the values of coredump_code too. Signed-off-by: Emanuele Rocca Link: https://patch.msgid.link/acE52HIFivNZN3nE@NH27D9T0LF Acked-by: Oleg Nesterov Signed-off-by: Christian Brauner --- fs/pidfs.c | 12 ++++++++---- include/uapi/linux/pidfd.h | 4 ++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index a8d1bca0395d..2acf84670578 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -57,6 +57,7 @@ struct pidfs_attr { }; __u32 coredump_mask; __u32 coredump_signal; + __u32 coredump_code; }; static struct rhashtable pidfs_ino_ht; @@ -333,7 +334,8 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags) PIDFD_INFO_EXIT | \ PIDFD_INFO_COREDUMP | \ PIDFD_INFO_SUPPORTED_MASK | \ - PIDFD_INFO_COREDUMP_SIGNAL) + PIDFD_INFO_COREDUMP_SIGNAL | \ + PIDFD_INFO_COREDUMP_CODE) static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { @@ -347,7 +349,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) const struct cred *c; __u64 mask; - BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2); + BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER3); if (!uinfo) return -EINVAL; @@ -380,9 +382,10 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned
long arg) if (mask & PIDFD_INFO_COREDUMP) { if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) { smp_rmb(); - kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL; + kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL | PIDFD_INFO_COREDUMP_CODE; kinfo.coredump_mask = attr->coredump_mask; kinfo.coredump_signal = attr->coredump_signal; + kinfo.coredump_code = attr->coredump_code; } } @@ -755,8 +758,9 @@ void pidfs_coredump(const struct coredump_params *cprm) PIDFD_COREDUMPED; /* If coredumping is set to skip we should never end up here. */ VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP); - /* Expose the signal number that caused the coredump. */ + /* Expose the signal number and code that caused the coredump. */ attr->coredump_signal = cprm->siginfo->si_signo; + attr->coredump_code = cprm->siginfo->si_code; smp_wmb(); set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask); } diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 9281956a9f32..0919246a1611 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -29,10 +29,12 @@ #define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */ #define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) /* Want/got supported mask flags */ #define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) /* Always returned if PIDFD_INFO_COREDUMP is requested. */ +#define PIDFD_INFO_COREDUMP_CODE (1UL << 7) /* Always returned if PIDFD_INFO_COREDUMP is requested. */ #define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */ #define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */ #define PIDFD_INFO_SIZE_VER2 80 /* sizeof third published struct */ +#define PIDFD_INFO_SIZE_VER3 88 /* sizeof fourth published struct */ /* * Values for @coredump_mask in pidfd_info. 
@@ -99,6 +101,8 @@ struct pidfd_info { struct /* coredump info */ { __u32 coredump_mask; __u32 coredump_signal; + __u32 coredump_code; + __u32 coredump_pad; /* align supported_mask to 8 bytes */ }; __u64 supported_mask; /* Mask flags that this kernel supports */ }; From 7aaa4915cb699378db1fa2a5c763ebea2caa35da Mon Sep 17 00:00:00 2001 From: Emanuele Rocca Date: Mon, 23 Mar 2026 14:03:15 +0100 Subject: [PATCH 9/9] selftests: check pidfd_info->coredump_code correctness Extend the coredump_socket and coredump_socket_protocol selftests to verify that the field coredump_code is set as expected in struct pidfd_info. Signed-off-by: Emanuele Rocca Link: https://patch.msgid.link/acE6Eyuv2MM75pmk@NH27D9T0LF Signed-off-by: Christian Brauner --- .../coredump/coredump_socket_protocol_test.c | 26 +++++++++++++++ .../selftests/coredump/coredump_socket_test.c | 32 +++++++++++++++++++ .../coredump/coredump_test_helpers.c | 4 +-- tools/testing/selftests/pidfd/pidfd.h | 5 +++ .../testing/selftests/pidfd/pidfd_info_test.c | 1 + 5 files changed, 66 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/coredump/coredump_socket_protocol_test.c b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c index d19b6717c53e..d9fa6239b5a9 100644 --- a/tools/testing/selftests/coredump/coredump_socket_protocol_test.c +++ b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c @@ -1004,6 +1004,8 @@ TEST_F(coredump, socket_request_invalid_size_large) * * Verify that when using socket-based coredump protocol, * the coredump_signal field is correctly exposed as SIGSEGV. + * Also check that the coredump_code field is correctly exposed + * as SEGV_MAPERR. 
*/ TEST_F(coredump, socket_coredump_signal_sigsegv) { @@ -1079,6 +1081,18 @@ TEST_F(coredump, socket_coredump_signal_sigsegv) goto out; } + /* Verify coredump_code is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_CODE)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_CODE not set in mask\n"); + goto out; + } + + if (info.coredump_code != SEGV_MAPERR) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_code=%d, expected SEGV_MAPERR=%d\n", + info.coredump_code, SEGV_MAPERR); + goto out; + } + if (!read_coredump_req(fd_coredump, &req)) { fprintf(stderr, "socket_coredump_signal_sigsegv: read_coredump_req failed\n"); goto out; @@ -1128,6 +1142,8 @@ TEST_F(coredump, socket_coredump_signal_sigsegv) ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); ASSERT_EQ(info.coredump_signal, SIGSEGV); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_CODE)); + ASSERT_EQ(info.coredump_code, SEGV_MAPERR); wait_and_check_coredump_server(pid_coredump_server, _metadata, self); } @@ -1137,6 +1153,8 @@ TEST_F(coredump, socket_coredump_signal_sigsegv) * * Verify that when using socket-based coredump protocol, * the coredump_signal field is correctly exposed as SIGABRT. + * Also check that the coredump_code field is correctly exposed + * as SI_TKILL. 
*/ TEST_F(coredump, socket_coredump_signal_sigabrt) { @@ -1212,6 +1230,12 @@ TEST_F(coredump, socket_coredump_signal_sigabrt) goto out; } + if (info.coredump_code != SI_TKILL) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_code=%d, expected SI_TKILL=%d\n", + info.coredump_code, SI_TKILL); + goto out; + } + if (!read_coredump_req(fd_coredump, &req)) { fprintf(stderr, "socket_coredump_signal_sigabrt: read_coredump_req failed\n"); goto out; @@ -1261,6 +1285,8 @@ TEST_F(coredump, socket_coredump_signal_sigabrt) ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); ASSERT_EQ(info.coredump_signal, SIGABRT); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_CODE)); + ASSERT_EQ(info.coredump_code, SI_TKILL); wait_and_check_coredump_server(pid_coredump_server, _metadata, self); } diff --git a/tools/testing/selftests/coredump/coredump_socket_test.c b/tools/testing/selftests/coredump/coredump_socket_test.c index 7e26d4a6a15d..422728f632ca 100644 --- a/tools/testing/selftests/coredump/coredump_socket_test.c +++ b/tools/testing/selftests/coredump/coredump_socket_test.c @@ -435,6 +435,8 @@ TEST_F(coredump, socket_no_listener) * * Verify that when using simple socket-based coredump (@ pattern), * the coredump_signal field is correctly exposed as SIGSEGV. + * Also check that the coredump_code field is correctly exposed + * as SEGV_MAPERR. 
*/ TEST_F(coredump, socket_coredump_signal_sigsegv) { @@ -509,6 +511,18 @@ TEST_F(coredump, socket_coredump_signal_sigsegv) goto out; } + /* Verify coredump_code is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_CODE)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_CODE not set in mask\n"); + goto out; + } + + if (info.coredump_code != SEGV_MAPERR) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_code=%d, expected SEGV_MAPERR=%d\n", + info.coredump_code, SEGV_MAPERR); + goto out; + } + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); if (fd_core_file < 0) { fprintf(stderr, "socket_coredump_signal_sigsegv: open_coredump_tmpfile failed: %m\n"); @@ -572,6 +586,8 @@ TEST_F(coredump, socket_coredump_signal_sigsegv) ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); ASSERT_EQ(info.coredump_signal, SIGSEGV); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_CODE)); + ASSERT_EQ(info.coredump_code, SEGV_MAPERR); wait_and_check_coredump_server(pid_coredump_server, _metadata, self); } @@ -581,6 +597,8 @@ TEST_F(coredump, socket_coredump_signal_sigsegv) * * Verify that when using simple socket-based coredump (@ pattern), * the coredump_signal field is correctly exposed as SIGABRT. + * Also check that the coredump_code field is correctly exposed + * as SI_TKILL. 
*/ TEST_F(coredump, socket_coredump_signal_sigabrt) { @@ -655,6 +673,18 @@ TEST_F(coredump, socket_coredump_signal_sigabrt) goto out; } + /* Verify coredump_code is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_CODE)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_CODE not set in mask\n"); + goto out; + } + + if (info.coredump_code != SI_TKILL) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_code=%d, expected SI_TKILL=%d\n", + info.coredump_code, SI_TKILL); + goto out; + } + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); if (fd_core_file < 0) { fprintf(stderr, "socket_coredump_signal_sigabrt: open_coredump_tmpfile failed: %m\n"); @@ -718,6 +748,8 @@ TEST_F(coredump, socket_coredump_signal_sigabrt) ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); ASSERT_EQ(info.coredump_signal, SIGABRT); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_CODE)); + ASSERT_EQ(info.coredump_code, SI_TKILL); wait_and_check_coredump_server(pid_coredump_server, _metadata, self); } diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c index 2c850e0b1b57..2a20faf9cb0a 100644 --- a/tools/testing/selftests/coredump/coredump_test_helpers.c +++ b/tools/testing/selftests/coredump/coredump_test_helpers.c @@ -148,8 +148,8 @@ bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) fprintf(stderr, "get_pidfd_info: ioctl(PIDFD_GET_INFO) failed: %m\n"); return false; } - fprintf(stderr, "get_pidfd_info: mask=0x%llx, coredump_mask=0x%x, coredump_signal=%d\n", - (unsigned long long)info->mask, info->coredump_mask, info->coredump_signal); + fprintf(stderr, "get_pidfd_info: mask=0x%llx, coredump_mask=0x%x, coredump_signal=%d, coredump_code=%d\n", + (unsigned long long)info->mask, info->coredump_mask, info->coredump_signal, info->coredump_code); return true; } diff --git 
a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 9085c1a3c005..5a4e78c10f43 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -156,6 +156,10 @@ #define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) #endif +#ifndef PIDFD_INFO_COREDUMP_CODE +#define PIDFD_INFO_COREDUMP_CODE (1UL << 7) +#endif + #ifndef PIDFD_COREDUMPED #define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */ #endif @@ -194,6 +198,7 @@ struct pidfd_info { struct { __u32 coredump_mask; __u32 coredump_signal; + __u32 coredump_code; }; __u64 supported_mask; }; diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index 8bed951e06a0..597012ed195f 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -724,6 +724,7 @@ TEST(supported_mask_field) ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP)); ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_SUPPORTED_MASK)); ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP_CODE)); /* Clean up */ sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);