summaryrefslogtreecommitdiff
path: root/fs/pidfs.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/pidfs.c')
-rw-r--r--fs/pidfs.c395
1 files changed, 343 insertions, 52 deletions
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 63f9699ebac3..c1f0a067be40 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -20,10 +20,34 @@
#include <linux/time_namespace.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
+#include <linux/coredump.h>
#include "internal.h"
#include "mount.h"
+static struct kmem_cache *pidfs_cachep __ro_after_init;
+
+/*
+ * Stashes information that userspace needs to access even after the
+ * process has been reaped.
+ */
+struct pidfs_exit_info {
+ __u64 cgroupid;
+ __s32 exit_code;
+ __u32 coredump_mask;
+};
+
+struct pidfs_inode {
+ struct pidfs_exit_info __pei;
+ struct pidfs_exit_info *exit_info;
+ struct inode vfs_inode;
+};
+
+static inline struct pidfs_inode *pidfs_i(struct inode *inode)
+{
+ return container_of(inode, struct pidfs_inode, vfs_inode);
+}
+
static struct rb_root pidfs_ino_tree = RB_ROOT;
#if BITS_PER_LONG == 32
@@ -188,36 +212,64 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct pid *pid = pidfd_pid(file);
- bool thread = file->f_flags & PIDFD_THREAD;
struct task_struct *task;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
/*
- * Depending on PIDFD_THREAD, inform pollers when the thread
- * or the whole thread-group exits.
+ * Don't wake waiters if the thread-group leader exited
+ * prematurely. They either get notified when the last subthread
+ * exits or not at all if one of the remaining subthreads execs
+ * and assumes the struct pid of the old thread-group leader.
*/
guard(rcu)();
task = pid_task(pid, PIDTYPE_PID);
if (!task)
poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
- else if (task->exit_state && (thread || thread_group_empty(task)))
+ else if (task->exit_state && !delay_group_leader(task))
poll_flags = EPOLLIN | EPOLLRDNORM;
return poll_flags;
}
-static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+static inline bool pid_in_current_pidns(const struct pid *pid)
+{
+ const struct pid_namespace *ns = task_active_pid_ns(current);
+
+ if (ns->level <= pid->level)
+ return pid->numbers[ns->level].ns == ns;
+
+ return false;
+}
+
+static __u32 pidfs_coredump_mask(unsigned long mm_flags)
+{
+ switch (__get_dumpable(mm_flags)) {
+ case SUID_DUMP_USER:
+ return PIDFD_COREDUMP_USER;
+ case SUID_DUMP_ROOT:
+ return PIDFD_COREDUMP_ROOT;
+ case SUID_DUMP_DISABLE:
+ return PIDFD_COREDUMP_SKIP;
+ default:
+ WARN_ON_ONCE(true);
+ }
+
+ return 0;
+}
+
+static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ struct inode *inode = file_inode(file);
+ struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
+ struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
+ struct task_struct *task;
const struct cred *c;
__u64 mask;
-#ifdef CONFIG_CGROUPS
- struct cgroup *cgrp;
-#endif
if (!uinfo)
return -EINVAL;
@@ -227,10 +279,53 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
return -EFAULT;
+ /*
+ * Restrict information retrieval to tasks within the caller's pid
+ * namespace hierarchy.
+ */
+ if (!pid_in_current_pidns(pid))
+ return -ESRCH;
+
+ if (mask & PIDFD_INFO_EXIT) {
+ exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+ if (exit_info) {
+ kinfo.mask |= PIDFD_INFO_EXIT;
+#ifdef CONFIG_CGROUPS
+ kinfo.cgroupid = exit_info->cgroupid;
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+#endif
+ kinfo.exit_code = exit_info->exit_code;
+ }
+ }
+
+ if (mask & PIDFD_INFO_COREDUMP) {
+ kinfo.mask |= PIDFD_INFO_COREDUMP;
+ kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+ }
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ /*
+ * If the task has already been reaped, only exit
+ * information is available
+ */
+ if (!(mask & PIDFD_INFO_EXIT))
+ return -ESRCH;
+
+ goto copy_out;
+ }
+
c = get_task_cred(task);
if (!c)
return -ESRCH;
+ if (!(kinfo.mask & PIDFD_INFO_COREDUMP)) {
+ task_lock(task);
+ if (task->mm)
+ kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags);
+ task_unlock(task);
+ }
+
/* Unconditionally return identifiers and credentials, the rest only on request */
user_ns = current_user_ns();
@@ -246,11 +341,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
put_cred(c);
#ifdef CONFIG_CGROUPS
- rcu_read_lock();
- cgrp = task_dfl_cgroup(task);
- kinfo.cgroupid = cgroup_id(cgrp);
- kinfo.mask |= PIDFD_INFO_CGROUPID;
- rcu_read_unlock();
+ if (!kinfo.cgroupid) {
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(task);
+ kinfo.cgroupid = cgroup_id(cgrp);
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+ rcu_read_unlock();
+ }
#endif
/*
@@ -270,16 +369,14 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
return -ESRCH;
+copy_out:
/*
* If userspace and the kernel have the same struct size it can just
* be copied. If userspace provides an older struct, only the bits that
* userspace knows about will be copied. If userspace provides a new
* struct, only the bits that the kernel knows about will be copied.
*/
- if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
- return -EFAULT;
-
- return 0;
+ return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
}
static bool pidfs_ioctl_valid(unsigned int cmd)
@@ -317,7 +414,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
struct nsproxy *nsp __free(put_nsproxy) = NULL;
- struct pid *pid = pidfd_pid(file);
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
@@ -332,13 +428,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return put_user(file_inode(file)->i_generation, argp);
}
- task = get_pid_task(pid, PIDTYPE_PID);
- if (!task)
- return -ESRCH;
-
/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
- return pidfd_info(task, cmd, arg);
+ return pidfd_info(file, cmd, arg);
+
+ task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
if (arg)
return -EINVAL;
@@ -450,6 +546,74 @@ struct pid *pidfd_pid(const struct file *file)
return file_inode(file)->i_private;
}
+/*
+ * We're called from release_task(). We know there's at least one
+ * reference to struct pid being held that won't be released until the
+ * task has been reaped which cannot happen until we're out of
+ * release_task().
+ *
+ * If this struct pid is referred to by a pidfd then
+ * stashed_dentry_get() will return the dentry and inode for that struct
+ * pid. Since we've taken a reference on it there's now an additional
+ * reference from the exit path on it. Which is fine. We're going to put
+ * it again in a second and we know that the pid is kept alive anyway.
+ *
+ * Worst case is that we've filled in the info and immediately free the
+ * dentry and inode afterwards since the pidfd has been closed. Since
+ * pidfs_exit() currently is placed after exit_task_work() we know that
+ * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ */
+void pidfs_exit(struct task_struct *tsk)
+{
+ struct dentry *dentry;
+
+ might_sleep();
+
+ dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
+ if (dentry) {
+ struct inode *inode = d_inode(dentry);
+ struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(tsk);
+ exit_info->cgroupid = cgroup_id(cgrp);
+ rcu_read_unlock();
+#endif
+ exit_info->exit_code = tsk->exit_code;
+
+ /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+ smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
+ dput(dentry);
+ }
+}
+
+#ifdef CONFIG_COREDUMP
+void pidfs_coredump(const struct coredump_params *cprm)
+{
+ struct pid *pid = cprm->pid;
+ struct pidfs_exit_info *exit_info;
+ struct dentry *dentry;
+ struct inode *inode;
+ __u32 coredump_mask = 0;
+
+ dentry = pid->stashed;
+ if (WARN_ON_ONCE(!dentry))
+ return;
+
+ inode = d_inode(dentry);
+ exit_info = &pidfs_i(inode)->__pei;
+ /* Note how we were coredumped. */
+ coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
+ /* Note that we actually did coredump. */
+ coredump_mask |= PIDFD_COREDUMPED;
+ /* If coredumping is set to skip we should never end up here. */
+ VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
+ smp_store_release(&exit_info->coredump_mask, coredump_mask);
+}
+#endif
+
static struct vfsmount *pidfs_mnt __ro_after_init;
/*
@@ -460,36 +624,14 @@ static struct vfsmount *pidfs_mnt __ro_after_init;
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
- return -EOPNOTSUPP;
+ return anon_inode_setattr(idmap, dentry, attr);
}
-
-/*
- * User space expects pidfs inodes to have no file type in st_mode.
- *
- * In particular, 'lsof' has this legacy logic:
- *
- * type = s->st_mode & S_IFMT;
- * switch (type) {
- * ...
- * case 0:
- * if (!strcmp(p, "anon_inode"))
- * Lf->ntype = Ntype = N_ANON_INODE;
- *
- * to detect our old anon_inode logic.
- *
- * Rather than mess with our internal sane inode data, just fix it
- * up here in getattr() by masking off the format bits.
- */
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
{
- struct inode *inode = d_inode(path->dentry);
-
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- stat->mode &= ~S_IFMT;
- return 0;
+ return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
}
static const struct inode_operations pidfs_inode_operations = {
@@ -505,9 +647,30 @@ static void pidfs_evict_inode(struct inode *inode)
put_pid(pid);
}
+static struct inode *pidfs_alloc_inode(struct super_block *sb)
+{
+ struct pidfs_inode *pi;
+
+ pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL);
+ if (!pi)
+ return NULL;
+
+ memset(&pi->__pei, 0, sizeof(pi->__pei));
+ pi->exit_info = NULL;
+
+ return &pi->vfs_inode;
+}
+
+static void pidfs_free_inode(struct inode *inode)
+{
+ kmem_cache_free(pidfs_cachep, pidfs_i(inode));
+}
+
static const struct super_operations pidfs_sops = {
+ .alloc_inode = pidfs_alloc_inode,
.drop_inode = generic_delete_inode,
.evict_inode = pidfs_evict_inode,
+ .free_inode = pidfs_free_inode,
.statfs = simple_statfs,
};
@@ -521,7 +684,6 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
}
const struct dentry_operations pidfs_dentry_operations = {
- .d_delete = always_delete_dentry,
.d_dname = pidfs_dname,
.d_prune = stashed_dentry_prune,
};
@@ -634,8 +796,53 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
return 0;
}
+static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
+ unsigned int flags)
+{
+ enum pid_type type;
+
+ if (flags & PIDFD_STALE)
+ return true;
+
+ /*
+ * Make sure that if a pidfd is created PIDFD_INFO_EXIT
+ * information will be available. So after an inode for the
+ * pidfd has been allocated perform another check that the pid
+ * is still alive. If it is exit information is available even
+ * if the task gets reaped before the pidfd is returned to
+ * userspace. The only exception are indicated by PIDFD_STALE:
+ *
+ * (1) The kernel is in the middle of task creation and thus no
+ * task linkage has been established yet.
+ * (2) The caller knows @pid has been registered in pidfs at a
+ * time when the task was still alive.
+ *
+ * In both cases exit information will have been reported.
+ */
+ if (flags & PIDFD_THREAD)
+ type = PIDTYPE_PID;
+ else
+ type = PIDTYPE_TGID;
+
+ /*
+ * Since pidfs_exit() is called before struct pid's task linkage
+ * is removed the case where the task got reaped but a dentry
+ * was already attached to struct pid and exit information was
+ * recorded and published can be handled correctly.
+ */
+ if (unlikely(!pid_has_task(pid, type))) {
+ struct inode *inode = d_inode(path->dentry);
+ return !!READ_ONCE(pidfs_i(inode)->exit_info);
+ }
+
+ return true;
+}
+
static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
{
+ if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags))
+ return ERR_PTR(-ESRCH);
+
/*
* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
* O_RDWR as pidfds always are.
@@ -656,7 +863,7 @@ static int pidfs_init_inode(struct inode *inode, void *data)
const struct pid *pid = data;
inode->i_private = data;
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
@@ -699,22 +906,106 @@ static struct file_system_type pidfs_type = {
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
{
-
struct file *pidfd_file;
- struct path path;
+ struct path path __free(path_put) = {};
int ret;
+ /*
+ * Ensure that PIDFD_STALE can be passed as a flag without
+ * overloading other uapi pidfd flags.
+ */
+ BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
+ BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
+
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
if (ret < 0)
return ERR_PTR(ret);
+ if (!pidfs_pid_valid(pid, &path, flags))
+ return ERR_PTR(-ESRCH);
+
+ flags &= ~PIDFD_STALE;
+ flags |= O_RDWR;
pidfd_file = dentry_open(&path, flags, current_cred());
- path_put(&path);
+ /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
+ if (!IS_ERR(pidfd_file))
+ pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+
return pidfd_file;
}
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs. Needs to be paired with
+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+ struct path path __free(path_put) = {};
+ int ret;
+
+ might_sleep();
+
+ if (!pid)
+ return 0;
+
+ ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
+ if (unlikely(ret))
+ return ret;
+ /* Keep the dentry and only put the reference to the mount. */
+ path.dentry = NULL;
+ return 0;
+}
+
+/**
+ * pidfs_get_pid - pin a struct pid through pidfs
+ * @pid: pid to pin
+ *
+ * Similar to pidfs_register_pid() but only valid if the caller knows
+ * there's a reference to the @pid through a dentry already that can't
+ * go away.
+ */
+void pidfs_get_pid(struct pid *pid)
+{
+ if (!pid)
+ return;
+ WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
+}
+
+/**
+ * pidfs_put_pid - drop a pidfs reference
+ * @pid: pid to drop
+ *
+ * Drop a reference to @pid via pidfs. This is only safe if the
+ * reference has been taken via pidfs_get_pid().
+ */
+void pidfs_put_pid(struct pid *pid)
+{
+ might_sleep();
+
+ if (!pid)
+ return;
+ VFS_WARN_ON_ONCE(!pid->stashed);
+ dput(pid->stashed);
+}
+
+static void pidfs_inode_init_once(void *data)
+{
+ struct pidfs_inode *pi = data;
+
+ inode_init_once(&pi->vfs_inode);
+}
+
void __init pidfs_init(void)
{
+ pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0,
+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT | SLAB_PANIC),
+ pidfs_inode_init_once);
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");