Diffstat (limited to 'fs/namespace.c')
 fs/namespace.c | 382
 1 file changed, 211 insertions(+), 171 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..54c59e091919 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -355,12 +355,13 @@ static struct mount *alloc_vfsmnt(const char *name)
if (err)
goto out_free_cache;
- if (name) {
+ if (name)
mnt->mnt_devname = kstrdup_const(name,
GFP_KERNEL_ACCOUNT);
- if (!mnt->mnt_devname)
- goto out_free_id;
- }
+ else
+ mnt->mnt_devname = "none";
+ if (!mnt->mnt_devname)
+ goto out_free_id;
#ifdef CONFIG_SMP
mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
@@ -787,15 +788,11 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
return 0;
mnt = real_mount(bastard);
mnt_add_count(mnt, 1);
- smp_mb(); // see mntput_no_expire()
+ smp_mb(); // see mntput_no_expire() and do_umount()
if (likely(!read_seqretry(&mount_lock, seq)))
return 0;
- if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
- mnt_add_count(mnt, -1);
- return 1;
- }
lock_mount_hash();
- if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
+ if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
mnt_add_count(mnt, -1);
unlock_mount_hash();
return 1;
@@ -1268,7 +1265,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
if (!fc->root)
return ERR_PTR(-EINVAL);
- mnt = alloc_vfsmnt(fc->source ?: "none");
+ mnt = alloc_vfsmnt(fc->source);
if (!mnt)
return ERR_PTR(-ENOMEM);
@@ -1329,21 +1326,6 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type,
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
-struct vfsmount *
-vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
- const char *name, void *data)
-{
- /* Until it is worked out how to pass the user namespace
- * through from the parent mount to the submount don't support
- * unprivileged mounts with submounts.
- */
- if (mountpoint->d_sb->s_user_ns != &init_user_ns)
- return ERR_PTR(-EPERM);
-
- return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
-}
-EXPORT_SYMBOL_GPL(vfs_submount);
-
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
int flag)
{
@@ -2048,6 +2030,7 @@ static int do_umount(struct mount *mnt, int flags)
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
+ smp_mb(); // paired with __legitimize_mnt()
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
@@ -2327,21 +2310,62 @@ out:
return dst_mnt;
}
-/* Caller should check returned pointer for errors */
+static inline bool extend_array(struct path **res, struct path **to_free,
+ unsigned n, unsigned *count, unsigned new_count)
+{
+ struct path *p;
+
+ if (likely(n < *count))
+ return true;
+ p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL);
+ if (p && *count)
+ memcpy(p, *res, *count * sizeof(struct path));
+ *count = new_count;
+ kfree(*to_free);
+ *to_free = *res = p;
+ return p;
+}
-struct vfsmount *collect_mounts(const struct path *path)
+struct path *collect_paths(const struct path *path,
+ struct path *prealloc, unsigned count)
{
- struct mount *tree;
- namespace_lock();
- if (!check_mnt(real_mount(path->mnt)))
- tree = ERR_PTR(-EINVAL);
- else
- tree = copy_tree(real_mount(path->mnt), path->dentry,
- CL_COPY_ALL | CL_PRIVATE);
- namespace_unlock();
- if (IS_ERR(tree))
- return ERR_CAST(tree);
- return &tree->mnt;
+ struct mount *root = real_mount(path->mnt);
+ struct mount *child;
+ struct path *res = prealloc, *to_free = NULL;
+ unsigned n = 0;
+
+ guard(rwsem_read)(&namespace_sem);
+
+ if (!check_mnt(root))
+ return ERR_PTR(-EINVAL);
+ if (!extend_array(&res, &to_free, 0, &count, 32))
+ return ERR_PTR(-ENOMEM);
+ res[n++] = *path;
+ list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
+ if (!is_subdir(child->mnt_mountpoint, path->dentry))
+ continue;
+ for (struct mount *m = child; m; m = next_mnt(m, child)) {
+ if (!extend_array(&res, &to_free, n, &count, 2 * count))
+ return ERR_PTR(-ENOMEM);
+ res[n].mnt = &m->mnt;
+ res[n].dentry = m->mnt.mnt_root;
+ n++;
+ }
+ }
+ if (!extend_array(&res, &to_free, n, &count, count + 1))
+ return ERR_PTR(-ENOMEM);
+ memset(res + n, 0, (count - n) * sizeof(struct path));
+ for (struct path *p = res; p->mnt; p++)
+ path_get(p);
+ return res;
+}
+
+void drop_collected_paths(struct path *paths, struct path *prealloc)
+{
+ for (struct path *p = paths; p->mnt; p++)
+ path_put(p);
+ if (paths != prealloc)
+ kfree(paths);
}
static void free_mnt_ns(struct mnt_namespace *);
@@ -2418,16 +2442,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
free_mnt_ns(ns);
}
-void drop_collected_mounts(struct vfsmount *mnt)
-{
- namespace_lock();
- lock_mount_hash();
- umount_tree(real_mount(mnt), 0);
- unlock_mount_hash();
- namespace_unlock();
-}
-
-bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
{
struct mount *child;
@@ -2441,6 +2456,16 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}
+bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+ bool res;
+
+ read_seqlock_excl(&mount_lock);
+ res = __has_locked_children(mnt, dentry);
+ read_sequnlock_excl(&mount_lock);
+ return res;
+}
+
/*
* Check that there aren't references to earlier/same mount namespaces in the
* specified subtree. Such references can act as pins for mount namespaces
@@ -2485,23 +2510,27 @@ struct vfsmount *clone_private_mount(const struct path *path)
if (IS_MNT_UNBINDABLE(old_mnt))
return ERR_PTR(-EINVAL);
- if (mnt_has_parent(old_mnt)) {
- if (!check_mnt(old_mnt))
- return ERR_PTR(-EINVAL);
- } else {
- if (!is_mounted(&old_mnt->mnt))
- return ERR_PTR(-EINVAL);
-
- /* Make sure this isn't something purely kernel internal. */
- if (!is_anon_ns(old_mnt->mnt_ns))
+ /*
+ * Make sure the source mount is acceptable.
+ * Anything mounted in our mount namespace is allowed.
+ * Otherwise, it must be the root of an anonymous mount
+ * namespace, and we need to make sure no namespace
+ * loops get created.
+ */
+ if (!check_mnt(old_mnt)) {
+ if (!is_mounted(&old_mnt->mnt) ||
+ !is_anon_ns(old_mnt->mnt_ns) ||
+ mnt_has_parent(old_mnt))
return ERR_PTR(-EINVAL);
- /* Make sure we don't create mount namespace loops. */
if (!check_for_nsfs_mounts(old_mnt))
return ERR_PTR(-EINVAL);
}
- if (has_locked_children(old_mnt, path->dentry))
+ if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (__has_locked_children(old_mnt, path->dentry))
return ERR_PTR(-EINVAL);
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
@@ -2514,21 +2543,6 @@ struct vfsmount *clone_private_mount(const struct path *path)
}
EXPORT_SYMBOL_GPL(clone_private_mount);
-int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
- struct vfsmount *root)
-{
- struct mount *mnt;
- int res = f(root, arg);
- if (res)
- return res;
- list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
- res = f(&mnt->mnt, arg);
- if (res)
- return res;
- }
- return 0;
-}
-
static void lock_mnt_tree(struct mount *mnt)
{
struct mount *p;
@@ -2754,14 +2768,14 @@ static int attach_recursive_mnt(struct mount *source_mnt,
hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
struct mount *q;
hlist_del_init(&child->mnt_hash);
- q = __lookup_mnt(&child->mnt_parent->mnt,
- child->mnt_mountpoint);
- if (q)
- mnt_change_mountpoint(child, smp, q);
/* Notice when we are propagating across user namespaces */
if (child->mnt_parent->mnt_ns->user_ns != user_ns)
lock_mnt_tree(child);
child->mnt.mnt_flags &= ~MNT_LOCKED;
+ q = __lookup_mnt(&child->mnt_parent->mnt,
+ child->mnt_mountpoint);
+ if (q)
+ mnt_change_mountpoint(child, smp, q);
commit_tree(child);
}
put_mountpoint(smp);
@@ -2826,56 +2840,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
struct vfsmount *mnt = path->mnt;
struct dentry *dentry;
struct mountpoint *mp = ERR_PTR(-ENOENT);
+ struct path under = {};
for (;;) {
- struct mount *m;
+ struct mount *m = real_mount(mnt);
if (beneath) {
- m = real_mount(mnt);
+ path_put(&under);
read_seqlock_excl(&mount_lock);
- dentry = dget(m->mnt_mountpoint);
+ under.mnt = mntget(&m->mnt_parent->mnt);
+ under.dentry = dget(m->mnt_mountpoint);
read_sequnlock_excl(&mount_lock);
+ dentry = under.dentry;
} else {
dentry = path->dentry;
}
inode_lock(dentry->d_inode);
- if (unlikely(cant_mount(dentry))) {
- inode_unlock(dentry->d_inode);
- goto out;
- }
-
namespace_lock();
- if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
+ if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
+ break; // not to be mounted on
+
+ if (beneath && unlikely(m->mnt_mountpoint != dentry ||
+ &m->mnt_parent->mnt != under.mnt)) {
namespace_unlock();
inode_unlock(dentry->d_inode);
- goto out;
+ continue; // got moved
}
mnt = lookup_mnt(path);
- if (likely(!mnt))
+ if (unlikely(mnt)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ path_put(path);
+ path->mnt = mnt;
+ path->dentry = dget(mnt->mnt_root);
+ continue; // got overmounted
+ }
+ mp = get_mountpoint(dentry);
+ if (IS_ERR(mp))
break;
-
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- if (beneath)
- dput(dentry);
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- }
-
- mp = get_mountpoint(dentry);
- if (IS_ERR(mp)) {
- namespace_unlock();
- inode_unlock(dentry->d_inode);
+ if (beneath) {
+ /*
+ * @under duplicates the references that will stay
+ * at least until namespace_unlock(), so the path_put()
+ * below is safe (and OK to do under namespace_lock -
+ * we are not dropping the final references here).
+ */
+ path_put(&under);
+ }
+ return mp;
}
-
-out:
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
if (beneath)
- dput(dentry);
-
+ path_put(&under);
return mp;
}
@@ -2886,14 +2906,11 @@ static inline struct mountpoint *lock_mount(struct path *path)
static void unlock_mount(struct mountpoint *where)
{
- struct dentry *dentry = where->m_dentry;
-
+ inode_unlock(where->m_dentry->d_inode);
read_seqlock_excl(&mount_lock);
put_mountpoint(where);
read_sequnlock_excl(&mount_lock);
-
namespace_unlock();
- inode_unlock(dentry->d_inode);
}
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
@@ -2944,6 +2961,10 @@ static int do_change_type(struct path *path, int ms_flags)
return -EINVAL;
namespace_lock();
+ if (!check_mnt(mnt)) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
@@ -3035,7 +3056,7 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
if (!may_copy_tree(old_path))
return mnt;
- if (!recurse && has_locked_children(old, old_path->dentry))
+ if (!recurse && __has_locked_children(old, old_path->dentry))
return mnt;
if (recurse)
@@ -3428,7 +3449,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
goto out;
/* From mount should not have locked children in place of To's root */
- if (has_locked_children(from, to->mnt.mnt_root))
+ if (__has_locked_children(from, to->mnt.mnt_root))
goto out;
/* Setting sharing groups is only allowed on private mounts */
@@ -3442,7 +3463,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
if (IS_MNT_SLAVE(from)) {
struct mount *m = from->mnt_master;
- list_add(&to->mnt_slave, &m->mnt_slave_list);
+ list_add(&to->mnt_slave, &from->mnt_slave);
to->mnt_master = m;
}
@@ -3467,18 +3488,25 @@ out:
* Check if path is overmounted, i.e., if there's a mount on top of
* @path->mnt with @path->dentry as mountpoint.
*
- * Context: This function expects namespace_lock() to be held.
+ * Context: namespace_sem must be held at least shared.
+ * MUST NOT be called under lock_mount_hash() (there one should just
+ * call __lookup_mnt() and check if it returns NULL).
* Return: If path is overmounted true is returned, false if not.
*/
static inline bool path_overmounted(const struct path *path)
{
+ unsigned seq = read_seqbegin(&mount_lock);
+ bool no_child;
+
rcu_read_lock();
- if (unlikely(__lookup_mnt(path->mnt, path->dentry))) {
- rcu_read_unlock();
- return true;
- }
+ no_child = !__lookup_mnt(path->mnt, path->dentry);
rcu_read_unlock();
- return false;
+ if (need_seqretry(&mount_lock, seq)) {
+ read_seqlock_excl(&mount_lock);
+ no_child = !__lookup_mnt(path->mnt, path->dentry);
+ read_sequnlock_excl(&mount_lock);
+ }
+ return unlikely(!no_child);
}
/**
@@ -3557,7 +3585,8 @@ static int can_move_mount_beneath(const struct path *from,
* @mnt_from itself. This defeats the whole purpose of mounting
* @mnt_from beneath @mnt_to.
*/
- if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
+ if (check_mnt(mnt_from) &&
+ propagation_would_overmount(parent_mnt_to, mnt_from, mp))
return -EINVAL;
return 0;
@@ -3636,46 +3665,41 @@ static int do_move_mount(struct path *old_path,
ns = old->mnt_ns;
err = -EINVAL;
- if (!may_use_mount(p))
- goto out;
-
/* The thing moved must be mounted... */
if (!is_mounted(&old->mnt))
goto out;
- /* ... and either ours or the root of anon namespace */
- if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
- goto out;
-
- if (is_anon_ns(ns)) {
+ if (check_mnt(old)) {
+ /* if the source is in our namespace... */
+ /* ... it should be detachable from parent */
+ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old))
+ goto out;
+ /* ... and the target should be in our namespace */
+ if (!check_mnt(p))
+ goto out;
+ } else {
/*
- * Ending up with two files referring to the root of the
- * same anonymous mount namespace would cause an error
- * as this would mean trying to move the same mount
- * twice into the mount tree which would be rejected
- * later. But be explicit about it right here.
+ * otherwise the source must be the root of some anon namespace.
+ * AV: check for mount being root of an anon namespace is worth
+ * an inlined predicate...
*/
- if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns))
+ if (!is_anon_ns(ns) || mnt_has_parent(old))
goto out;
-
/*
- * If this is an anonymous mount tree ensure that mount
- * propagation can detect mounts that were just
- * propagated to the target mount tree so we don't
- * propagate onto them.
+ * Bail out early if the target is within the same namespace -
+ * subsequent checks would've rejected that, but they lose
+ * some corner cases if we check it early.
*/
- ns->mntns_flags |= MNTNS_PROPAGATING;
- } else if (is_anon_ns(p->mnt_ns)) {
+ if (ns == p->mnt_ns)
+ goto out;
/*
- * Don't allow moving an attached mount tree to an
- * anonymous mount tree.
+ * Target should be either in our namespace or in an acceptable
+ * anon namespace, sensu check_anonymous_mnt().
*/
- goto out;
+ if (!may_use_mount(p))
+ goto out;
}
- if (old->mnt.mnt_flags & MNT_LOCKED)
- goto out;
-
if (!path_mounted(old_path))
goto out;
@@ -3715,9 +3739,6 @@ static int do_move_mount(struct path *old_path,
if (err)
goto out;
- if (is_anon_ns(ns))
- ns->mntns_flags &= ~MNTNS_PROPAGATING;
-
/* if the mount is moved, it should no longer be expired
* automatically */
list_del_init(&old->mnt_expire);
@@ -3899,10 +3920,6 @@ int finish_automount(struct vfsmount *m, const struct path *path)
return PTR_ERR(m);
mnt = real_mount(m);
- /* The new mount record should have at least 2 refs to prevent it being
- * expired before we get a chance to add it
- */
- BUG_ON(mnt_get_count(mnt) < 2);
if (m->mnt_sb == path->mnt->mnt_sb &&
m->mnt_root == dentry) {
@@ -3935,7 +3952,6 @@ int finish_automount(struct vfsmount *m, const struct path *path)
unlock_mount(mp);
if (unlikely(err))
goto discard;
- mntput(m);
return 0;
discard_locked:
@@ -3949,7 +3965,6 @@ discard:
namespace_unlock();
}
mntput(m);
- mntput(m);
return err;
}
@@ -3986,11 +4001,14 @@ void mark_mounts_for_expiry(struct list_head *mounts)
/* extract from the expiration list every vfsmount that matches the
* following criteria:
+ * - already mounted
* - only referenced by its parent vfsmount
* - still marked for expiry (marked on the last call here; marks are
* cleared by mntput())
*/
list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
+ if (!is_mounted(&mnt->mnt))
+ continue;
if (!xchg(&mnt->mnt_expiry_mark, 1) ||
propagate_mount_busy(mnt, 1))
continue;
@@ -5289,16 +5307,12 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
kattr.kflags |= MOUNT_KATTR_RECURSE;
ret = wants_mount_setattr(uattr, usize, &kattr);
- if (ret < 0)
- return ret;
-
- if (ret) {
+ if (ret > 0) {
ret = do_mount_setattr(&file->f_path, &kattr);
- if (ret)
- return ret;
-
finish_mount_kattr(&kattr);
}
+ if (ret)
+ return ret;
}
fd = get_unused_fd_flags(flags & O_CLOEXEC);
@@ -5491,7 +5505,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
seq->buf[seq->count] = '\0';
seq->count = start;
seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
- } else if (r->mnt_devname) {
+ } else {
seq_puts(seq, r->mnt_devname);
}
return 0;
@@ -5804,7 +5818,9 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
STATMOUNT_SB_SOURCE | \
STATMOUNT_OPT_ARRAY | \
STATMOUNT_OPT_SEC_ARRAY | \
- STATMOUNT_SUPPORTED_MASK)
+ STATMOUNT_SUPPORTED_MASK | \
+ STATMOUNT_MNT_UIDMAP | \
+ STATMOUNT_MNT_GIDMAP)
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
@@ -5839,13 +5855,29 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
return err;
s->root = root;
- s->idmap = mnt_idmap(s->mnt);
- if (s->mask & STATMOUNT_SB_BASIC)
- statmount_sb_basic(s);
+ /*
+ * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
+ * can change concurrently as we only hold the read-side of the
+ * namespace semaphore and mount properties may change with only
+ * the mount lock held.
+ *
+ * We could sample the mount lock sequence counter to detect
+ * those changes and retry. But it's not worth it. Worst that
+ * happens is that the mnt->mnt_idmap pointer is already changed
+ * while mnt->mnt_flags isn't or vice versa. So what.
+ *
+ * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
+ * via READ_ONCE()/WRITE_ONCE() and guard against theoretical
+ * torn read/write. That's all we care about right now.
+ */
+ s->idmap = mnt_idmap(s->mnt);
if (s->mask & STATMOUNT_MNT_BASIC)
statmount_mnt_basic(s);
+ if (s->mask & STATMOUNT_SB_BASIC)
+ statmount_sb_basic(s);
+
if (s->mask & STATMOUNT_PROPAGATE_FROM)
statmount_propagate_from(s);
@@ -6157,6 +6189,10 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -ENOENT;
+ /*
+ * We only need to guard against mount topology changes as
+ * listmount() doesn't care about any mount properties.
+ */
scoped_guard(rwsem_read, &namespace_sem)
ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
@@ -6239,7 +6275,11 @@ void put_mnt_ns(struct mnt_namespace *ns)
{
if (!refcount_dec_and_test(&ns->ns.count))
return;
- drop_collected_mounts(&ns->root->mnt);
+ namespace_lock();
+ lock_mount_hash();
+ umount_tree(ns->root, 0);
+ unlock_mount_hash();
+ namespace_unlock();
free_mnt_ns(ns);
}
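
A minimal caller sketch (not part of the diff above) of the collect_paths()/drop_collected_paths() pair introduced here, which replaces the removed collect_mounts()/drop_collected_mounts()/iterate_mounts() interface. The helper name walk_mounts_under() and the preallocation size are illustrative assumptions; the NULL-terminated result array, the ERR_PTR() error returns and the prealloc handling follow from the hunks above.

static int walk_mounts_under(const struct path *path)
{
	struct path prealloc[16];	/* size chosen arbitrarily for the example */
	struct path *paths;

	/*
	 * collect_paths() returns a NULL-terminated (p->mnt == NULL) array
	 * of struct path; each returned entry holds its own references.
	 */
	paths = collect_paths(path, prealloc, ARRAY_SIZE(prealloc));
	if (IS_ERR(paths))
		return PTR_ERR(paths);

	for (struct path *p = paths; p->mnt; p++) {
		/* act on p->mnt / p->dentry here */
	}

	/*
	 * Drops the references taken by collect_paths() and frees the
	 * array unless it is the caller-supplied preallocation.
	 */
	drop_collected_paths(paths, prealloc);
	return 0;
}

The prealloc/count arguments let a caller with a small common case avoid the kmalloc_array() growth path in extend_array(); passing NULL and 0 simply forces the initial allocation of 32 entries.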