mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-16 02:01:18 -04:00
Merge tag 'vfs-7.1-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull clone and pidfs updates from Christian Brauner:
"Add three new clone3() flags for pidfd-based process lifecycle
management.
CLONE_AUTOREAP:
CLONE_AUTOREAP makes a child process auto-reap on exit without ever
becoming a zombie. This is a per-process property in contrast to
the existing auto-reap mechanism via SA_NOCLDWAIT or SIG_IGN for
SIGCHLD which applies to all children of a given parent.
Currently the only way to automatically reap children is to set
SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped
property affecting all children which makes it unsuitable for
libraries or applications that need selective auto-reaping of
specific children while still being able to wait() on others.
CLONE_AUTOREAP stores an autoreap flag in the child's
signal_struct. When the child exits do_notify_parent() checks this
flag and causes exit_notify() to transition the task directly to
EXIT_DEAD. Since the flag lives on the child it survives
reparenting: if the original parent exits and the child is
reparented to a subreaper or init the child still auto-reaps when
it eventually exits. This is cleaner than forcing the subreaper to
get SIGCHLD and then reaping it. If the parent doesn't care the
subreaper won't care. If there's a subreaper that would care it
would be easy enough to add a prctl() that either just turns back
on SIGCHLD and turns off auto-reaping or a prctl() that just
notifies the subreaper whenever a child is reparented to it.
CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent
to monitor the child's exit via poll() and retrieve exit status via
PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget
pattern. No exit signal is delivered so exit_signal must be zero.
CLONE_THREAD and CLONE_PARENT are rejected: CLONE_THREAD because
autoreap is a process-level property, and CLONE_PARENT because an
autoreap child reparented via CLONE_PARENT could become an
invisible zombie under a parent that never calls wait().
The flag is not inherited by the autoreap process's own children.
Each child that should be autoreaped must be explicitly created
with CLONE_AUTOREAP.
CLONE_NNP:
CLONE_NNP sets no_new_privs on the child at clone time. Unlike
prctl(PR_SET_NO_NEW_PRIVS) which a process sets on itself,
CLONE_NNP allows the parent to impose no_new_privs on the child at
creation without affecting the parent's own privileges.
CLONE_THREAD is rejected because threads share credentials.
CLONE_NNP is useful on its own for any spawn-and-sandbox pattern
but was specifically introduced to enable unprivileged usage of
CLONE_PIDFD_AUTOKILL.
CLONE_PIDFD_AUTOKILL:
This flag ties a child's lifetime to the pidfd returned from
clone3(). When the last reference to the struct file created by
clone3() is closed the kernel sends SIGKILL to the child. A pidfd
obtained via pidfd_open() for the same process does not keep the
child alive and does not trigger autokill - only the specific
struct file from clone3() has this property. This is useful for
container runtimes, service managers, and sandboxed subprocess
execution - any scenario where the child must die if the parent
crashes or abandons the pidfd or just wants a throwaway helper
process.
CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD and CLONE_AUTOREAP.
It requires CLONE_PIDFD because the whole point is tying the
child's lifetime to the pidfd. It requires CLONE_AUTOREAP because a
killed child with no one to reap it would become a zombie - the
primary use case is the parent crashing or abandoning the pidfd so
no one is around to call waitpid(). CLONE_THREAD is rejected
because autokill targets a process not a thread.
If CLONE_NNP is specified together with CLONE_PIDFD_AUTOKILL an
unprivileged user may spawn a process that is autokilled. The child
cannot escalate privileges via setuid/setgid exec after being
spawned. If CLONE_PIDFD_AUTOKILL is specified without CLONE_NNP the
caller must have have CAP_SYS_ADMIN in its user namespace"
* tag 'vfs-7.1-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
selftests: check pidfd_info->coredump_code correctness
pidfds: add coredump_code field to pidfd_info
kselftest/coredump: reintroduce null pointer dereference
selftests/pidfd: add CLONE_PIDFD_AUTOKILL tests
selftests/pidfd: add CLONE_NNP tests
selftests/pidfd: add CLONE_AUTOREAP tests
pidfd: add CLONE_PIDFD_AUTOKILL
clone: add CLONE_NNP
clone: add CLONE_AUTOREAP
This commit is contained in:
@@ -13,6 +13,7 @@
|
||||
#ifdef __KERNEL__
|
||||
#include <linux/sched.h>
|
||||
#define PIDFD_STALE CLONE_PIDFD
|
||||
#define PIDFD_AUTOKILL O_TRUNC
|
||||
#endif
|
||||
|
||||
/* Flags for pidfd_send_signal(). */
|
||||
@@ -28,10 +29,12 @@
|
||||
#define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */
|
||||
#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) /* Want/got supported mask flags */
|
||||
#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) /* Always returned if PIDFD_INFO_COREDUMP is requested. */
|
||||
#define PIDFD_INFO_COREDUMP_CODE (1UL << 7) /* Always returned if PIDFD_INFO_COREDUMP is requested. */
|
||||
|
||||
#define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */
|
||||
#define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */
|
||||
#define PIDFD_INFO_SIZE_VER2 80 /* sizeof third published struct */
|
||||
#define PIDFD_INFO_SIZE_VER3 88 /* sizeof fourth published struct */
|
||||
|
||||
/*
|
||||
* Values for @coredump_mask in pidfd_info.
|
||||
@@ -98,6 +101,8 @@ struct pidfd_info {
|
||||
struct /* coredump info */ {
|
||||
__u32 coredump_mask;
|
||||
__u32 coredump_signal;
|
||||
__u32 coredump_code;
|
||||
__u32 coredump_pad; /* align supported_mask to 8 bytes */
|
||||
};
|
||||
__u64 supported_mask; /* Mask flags that this kernel supports */
|
||||
};
|
||||
|
||||
@@ -34,8 +34,11 @@
|
||||
#define CLONE_IO 0x80000000 /* Clone io context */
|
||||
|
||||
/* Flags for the clone3() syscall. */
|
||||
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
|
||||
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
|
||||
#define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */
|
||||
#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */
|
||||
#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */
|
||||
#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */
|
||||
#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */
|
||||
|
||||
/*
|
||||
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
|
||||
|
||||
Reference in New Issue
Block a user