Diffstat (limited to 'fs/namespace.c')
-rw-r--r--	fs/namespace.c	382
1 files changed, 211 insertions, 171 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..54c59e091919 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -355,12 +355,13 @@ static struct mount *alloc_vfsmnt(const char *name)
 	if (err)
 		goto out_free_cache;
 
-	if (name) {
+	if (name)
 		mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT);
-		if (!mnt->mnt_devname)
-			goto out_free_id;
-	}
+	else
+		mnt->mnt_devname = "none";
+	if (!mnt->mnt_devname)
+		goto out_free_id;
 
 #ifdef CONFIG_SMP
 	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
@@ -787,15 +788,11 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 		return 0;
 	mnt = real_mount(bastard);
 	mnt_add_count(mnt, 1);
-	smp_mb();			// see mntput_no_expire()
+	smp_mb();			// see mntput_no_expire() and do_umount()
 	if (likely(!read_seqretry(&mount_lock, seq)))
 		return 0;
-	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
-		mnt_add_count(mnt, -1);
-		return 1;
-	}
 	lock_mount_hash();
-	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
+	if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
 		mnt_add_count(mnt, -1);
 		unlock_mount_hash();
 		return 1;
@@ -1268,7 +1265,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
 	if (!fc->root)
 		return ERR_PTR(-EINVAL);
 
-	mnt = alloc_vfsmnt(fc->source ?: "none");
+	mnt = alloc_vfsmnt(fc->source);
 	if (!mnt)
 		return ERR_PTR(-ENOMEM);
 
@@ -1329,21 +1326,6 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
-struct vfsmount *
-vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
-	     const char *name, void *data)
-{
-	/* Until it is worked out how to pass the user namespace
-	 * through from the parent mount to the submount don't support
-	 * unprivileged mounts with submounts.
-	 */
-	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
-		return ERR_PTR(-EPERM);
-
-	return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
-}
-EXPORT_SYMBOL_GPL(vfs_submount);
-
 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 					int flag)
 {
@@ -2048,6 +2030,7 @@ static int do_umount(struct mount *mnt, int flags)
 		umount_tree(mnt, UMOUNT_PROPAGATE);
 		retval = 0;
 	} else {
+		smp_mb(); // paired with __legitimize_mnt()
 		shrink_submounts(mnt);
 		retval = -EBUSY;
 		if (!propagate_mount_busy(mnt, 2)) {
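The two devname hunks above move the "none" fallback from vfs_create_mount() into alloc_vfsmnt(). Assigning a string literal there is safe because mnt_devname is released with kfree_const(), which leaves .rodata strings alone. A minimal standalone sketch of that pairing (the helper names are illustrative, not from the patch):

	#include <linux/string.h>
	#include <linux/slab.h>

	/* Illustrative only: mirrors the kstrdup_const()/kfree_const()
	 * contract that alloc_vfsmnt() relies on. */
	static const char *my_label(const char *name)
	{
		if (name)
			/* may return name itself if it lives in .rodata */
			return kstrdup_const(name, GFP_KERNEL);
		/* literal is in .rodata, so kfree_const() is a no-op for it */
		return "none";
	}

	static void my_label_free(const char *label)
	{
		/* only kfree()s what kstrdup_const() actually duplicated */
		kfree_const(label);
	}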
@@ -2327,21 +2310,62 @@ out:
 	return dst_mnt;
 }
 
-/* Caller should check returned pointer for errors */
+static inline bool extend_array(struct path **res, struct path **to_free,
+				unsigned n, unsigned *count, unsigned new_count)
+{
+	struct path *p;
+
+	if (likely(n < *count))
+		return true;
+	p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL);
+	if (p && *count)
+		memcpy(p, *res, *count * sizeof(struct path));
+	*count = new_count;
+	kfree(*to_free);
+	*to_free = *res = p;
+	return p;
+}
 
-struct vfsmount *collect_mounts(const struct path *path)
+struct path *collect_paths(const struct path *path,
+			   struct path *prealloc, unsigned count)
 {
-	struct mount *tree;
-	namespace_lock();
-	if (!check_mnt(real_mount(path->mnt)))
-		tree = ERR_PTR(-EINVAL);
-	else
-		tree = copy_tree(real_mount(path->mnt), path->dentry,
-				 CL_COPY_ALL | CL_PRIVATE);
-	namespace_unlock();
-	if (IS_ERR(tree))
-		return ERR_CAST(tree);
-	return &tree->mnt;
+	struct mount *root = real_mount(path->mnt);
+	struct mount *child;
+	struct path *res = prealloc, *to_free = NULL;
+	unsigned n = 0;
+
+	guard(rwsem_read)(&namespace_sem);
+
+	if (!check_mnt(root))
+		return ERR_PTR(-EINVAL);
+	if (!extend_array(&res, &to_free, 0, &count, 32))
+		return ERR_PTR(-ENOMEM);
+	res[n++] = *path;
+	list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
+		if (!is_subdir(child->mnt_mountpoint, path->dentry))
+			continue;
+		for (struct mount *m = child; m; m = next_mnt(m, child)) {
+			if (!extend_array(&res, &to_free, n, &count, 2 * count))
+				return ERR_PTR(-ENOMEM);
+			res[n].mnt = &m->mnt;
+			res[n].dentry = m->mnt.mnt_root;
+			n++;
+		}
+	}
+	if (!extend_array(&res, &to_free, n, &count, count + 1))
+		return ERR_PTR(-ENOMEM);
+	memset(res + n, 0, (count - n) * sizeof(struct path));
+	for (struct path *p = res; p->mnt; p++)
+		path_get(p);
+	return res;
+}
+
+void drop_collected_paths(struct path *paths, struct path *prealloc)
+{
+	for (struct path *p = paths; p->mnt; p++)
+		path_put(p);
+	if (paths != prealloc)
+		kfree(paths);
 }
 
 static void free_mnt_ns(struct mnt_namespace *);
@@ -2418,16 +2442,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
 	free_mnt_ns(ns);
 }
 
-void drop_collected_mounts(struct vfsmount *mnt)
-{
-	namespace_lock();
-	lock_mount_hash();
-	umount_tree(real_mount(mnt), 0);
-	unlock_mount_hash();
-	namespace_unlock();
-}
-
-bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
 {
 	struct mount *child;
 
@@ -2441,6 +2456,16 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
 	return false;
 }
 
+bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+	bool res;
+
+	read_seqlock_excl(&mount_lock);
+	res = __has_locked_children(mnt, dentry);
+	read_sequnlock_excl(&mount_lock);
+	return res;
+}
+
 /*
  * Check that there aren't references to earlier/same mount namespaces in the
  * specified subtree.  Such references can act as pins for mount namespaces
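The collect_mounts()/drop_collected_mounts() pair (and, further down, iterate_mounts()) is replaced by an array-returning API: collect_paths() fills or grows past a caller-supplied buffer and returns a NULL-mnt-terminated array of referenced paths. A hedged sketch of a consumer, assuming an on-stack prealloc buffer (the helper name and per-mount work are placeholders; the in-tree user is the audit subsystem):

	/* Sketch of a collect_paths() caller; sizes and names are
	 * illustrative. drop_collected_paths() drops the references and
	 * frees the array iff it is not the preallocated one. */
	static int my_visit_mounts(const struct path *path)
	{
		struct path prealloc[16];	/* arbitrary common-case size */
		struct path *paths;

		paths = collect_paths(path, prealloc, ARRAY_SIZE(prealloc));
		if (IS_ERR(paths))
			return PTR_ERR(paths);
		for (struct path *p = paths; p->mnt; p++) {
			/* per-mount work here; every entry holds references */
		}
		drop_collected_paths(paths, prealloc);
		return 0;
	}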
@@ -2485,23 +2510,27 @@ struct vfsmount *clone_private_mount(const struct path *path)
 	if (IS_MNT_UNBINDABLE(old_mnt))
 		return ERR_PTR(-EINVAL);
 
-	if (mnt_has_parent(old_mnt)) {
-		if (!check_mnt(old_mnt))
-			return ERR_PTR(-EINVAL);
-	} else {
-		if (!is_mounted(&old_mnt->mnt))
-			return ERR_PTR(-EINVAL);
-
-		/* Make sure this isn't something purely kernel internal. */
-		if (!is_anon_ns(old_mnt->mnt_ns))
+	/*
+	 * Make sure the source mount is acceptable.
+	 * Anything mounted in our mount namespace is allowed.
+	 * Otherwise, it must be the root of an anonymous mount
+	 * namespace, and we need to make sure no namespace
+	 * loops get created.
+	 */
+	if (!check_mnt(old_mnt)) {
+		if (!is_mounted(&old_mnt->mnt) ||
+		    !is_anon_ns(old_mnt->mnt_ns) ||
+		    mnt_has_parent(old_mnt))
 			return ERR_PTR(-EINVAL);
 
-		/* Make sure we don't create mount namespace loops. */
 		if (!check_for_nsfs_mounts(old_mnt))
 			return ERR_PTR(-EINVAL);
 	}
 
-	if (has_locked_children(old_mnt, path->dentry))
+	if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if (__has_locked_children(old_mnt, path->dentry))
 		return ERR_PTR(-EINVAL);
 
 	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
@@ -2514,21 +2543,6 @@ struct vfsmount *clone_private_mount(const struct path *path)
 }
 EXPORT_SYMBOL_GPL(clone_private_mount);
 
-int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
-		   struct vfsmount *root)
-{
-	struct mount *mnt;
-	int res = f(root, arg);
-	if (res)
-		return res;
-	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
-		res = f(&mnt->mnt, arg);
-		if (res)
-			return res;
-	}
-	return 0;
-}
-
 static void lock_mnt_tree(struct mount *mnt)
 {
 	struct mount *p;
@@ -2754,14 +2768,14 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
 			struct mount *q;
 			hlist_del_init(&child->mnt_hash);
-			q = __lookup_mnt(&child->mnt_parent->mnt,
-					 child->mnt_mountpoint);
-			if (q)
-				mnt_change_mountpoint(child, smp, q);
 			/* Notice when we are propagating across user namespaces */
 			if (child->mnt_parent->mnt_ns->user_ns != user_ns)
 				lock_mnt_tree(child);
 			child->mnt.mnt_flags &= ~MNT_LOCKED;
+			q = __lookup_mnt(&child->mnt_parent->mnt,
+					 child->mnt_mountpoint);
+			if (q)
+				mnt_change_mountpoint(child, smp, q);
 			commit_tree(child);
 		}
 		put_mountpoint(smp);
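The new source-mount test in clone_private_mount() (and the equivalent one spelled out later in do_move_mount(), where the diff itself remarks it "is worth an inlined predicate") boils down to one condition. A hypothetical helper, not in the patch:

	/* Hypothetical predicate, sketched from the checks above: is @m
	 * the root of an anonymous mount namespace? */
	static inline bool is_anon_ns_root(struct mount *m)
	{
		return is_mounted(&m->mnt) &&
		       is_anon_ns(m->mnt_ns) &&
		       !mnt_has_parent(m);
	}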
@@ -2826,56 +2840,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
 	struct vfsmount *mnt = path->mnt;
 	struct dentry *dentry;
 	struct mountpoint *mp = ERR_PTR(-ENOENT);
+	struct path under = {};
 
 	for (;;) {
-		struct mount *m;
+		struct mount *m = real_mount(mnt);
 
 		if (beneath) {
-			m = real_mount(mnt);
+			path_put(&under);
 			read_seqlock_excl(&mount_lock);
-			dentry = dget(m->mnt_mountpoint);
+			under.mnt = mntget(&m->mnt_parent->mnt);
+			under.dentry = dget(m->mnt_mountpoint);
 			read_sequnlock_excl(&mount_lock);
+			dentry = under.dentry;
 		} else {
 			dentry = path->dentry;
 		}
 
 		inode_lock(dentry->d_inode);
-		if (unlikely(cant_mount(dentry))) {
-			inode_unlock(dentry->d_inode);
-			goto out;
-		}
-
 		namespace_lock();
 
-		if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
+		if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
+			break;	// not to be mounted on
+
+		if (beneath && unlikely(m->mnt_mountpoint != dentry ||
+					&m->mnt_parent->mnt != under.mnt)) {
 			namespace_unlock();
 			inode_unlock(dentry->d_inode);
-			goto out;
+			continue;	// got moved
 		}
 
 		mnt = lookup_mnt(path);
-		if (likely(!mnt))
+		if (unlikely(mnt)) {
+			namespace_unlock();
+			inode_unlock(dentry->d_inode);
+			path_put(path);
+			path->mnt = mnt;
+			path->dentry = dget(mnt->mnt_root);
+			continue;	// got overmounted
+		}
+		mp = get_mountpoint(dentry);
+		if (IS_ERR(mp))
 			break;
-
-		namespace_unlock();
-		inode_unlock(dentry->d_inode);
-		if (beneath)
-			dput(dentry);
-		path_put(path);
-		path->mnt = mnt;
-		path->dentry = dget(mnt->mnt_root);
-	}
-
-	mp = get_mountpoint(dentry);
-	if (IS_ERR(mp)) {
-		namespace_unlock();
-		inode_unlock(dentry->d_inode);
+		if (beneath) {
+			/*
+			 * @under duplicates the references that will stay
+			 * at least until namespace_unlock(), so the path_put()
+			 * below is safe (and OK to do under namespace_lock -
+			 * we are not dropping the final references here).
+			 */
+			path_put(&under);
+		}
+		return mp;
 	}
-
-out:
+	namespace_unlock();
+	inode_unlock(dentry->d_inode);
 	if (beneath)
-		dput(dentry);
-
+		path_put(&under);
 	return mp;
 }
@@ -2886,14 +2906,11 @@ static inline struct mountpoint *lock_mount(struct path *path)
 
 static void unlock_mount(struct mountpoint *where)
 {
-	struct dentry *dentry = where->m_dentry;
-
+	inode_unlock(where->m_dentry->d_inode);
 	read_seqlock_excl(&mount_lock);
 	put_mountpoint(where);
 	read_sequnlock_excl(&mount_lock);
-
 	namespace_unlock();
-	inode_unlock(dentry->d_inode);
 }
 
 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
@@ -2944,6 +2961,10 @@ static int do_change_type(struct path *path, int ms_flags)
 		return -EINVAL;
 
 	namespace_lock();
+	if (!check_mnt(mnt)) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
 		if (err)
@@ -3035,7 +3056,7 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
 	if (!may_copy_tree(old_path))
 		return mnt;
 
-	if (!recurse && has_locked_children(old, old_path->dentry))
+	if (!recurse && __has_locked_children(old, old_path->dentry))
 		return mnt;
 
 	if (recurse)
@@ -3428,7 +3449,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 		goto out;
 
 	/* From mount should not have locked children in place of To's root */
-	if (has_locked_children(from, to->mnt.mnt_root))
+	if (__has_locked_children(from, to->mnt.mnt_root))
 		goto out;
 
 	/* Setting sharing groups is only allowed on private mounts */
@@ -3442,7 +3463,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 	if (IS_MNT_SLAVE(from)) {
 		struct mount *m = from->mnt_master;
 
-		list_add(&to->mnt_slave, &m->mnt_slave_list);
+		list_add(&to->mnt_slave, &from->mnt_slave);
 		to->mnt_master = m;
 	}
@@ -3467,18 +3488,25 @@ out:
  * Check if path is overmounted, i.e., if there's a mount on top of
  * @path->mnt with @path->dentry as mountpoint.
  *
- * Context: This function expects namespace_lock() to be held.
+ * Context: namespace_sem must be held at least shared.
+ * MUST NOT be called under lock_mount_hash() (there one should just
+ * call __lookup_mnt() and check if it returns NULL).
  * Return: If path is overmounted true is returned, false if not.
  */
 static inline bool path_overmounted(const struct path *path)
 {
+	unsigned seq = read_seqbegin(&mount_lock);
+	bool no_child;
+
 	rcu_read_lock();
-	if (unlikely(__lookup_mnt(path->mnt, path->dentry))) {
-		rcu_read_unlock();
-		return true;
-	}
+	no_child = !__lookup_mnt(path->mnt, path->dentry);
 	rcu_read_unlock();
-	return false;
+	if (need_seqretry(&mount_lock, seq)) {
+		read_seqlock_excl(&mount_lock);
+		no_child = !__lookup_mnt(path->mnt, path->dentry);
+		read_sequnlock_excl(&mount_lock);
+	}
+	return unlikely(!no_child);
 }
 
 /**
@@ -3557,7 +3585,8 @@ static int can_move_mount_beneath(const struct path *from,
 	 * @mnt_from itself. This defeats the whole purpose of mounting
	 * @mnt_from beneath @mnt_to.
	 */
-	if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
+	if (check_mnt(mnt_from) &&
+	    propagation_would_overmount(parent_mnt_to, mnt_from, mp))
 		return -EINVAL;
 
 	return 0;
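The path_overmounted() rewrite is an instance of the optimistic-seqlock idiom: do the lookup locklessly, and only if the sequence counter moved, redo it under the exclusive read lock, which blocks writers and so needs no retry loop. A generic sketch of the shape (the lock name and my_read() are placeholders, not from the patch):

	/* Generic lockless-then-locked retry, as used by
	 * path_overmounted(); my_read() stands in for the racy lookup. */
	static bool my_stable_read(seqlock_t *my_lock)
	{
		unsigned seq = read_seqbegin(my_lock);
		bool v;

		rcu_read_lock();
		v = my_read();		/* optimistic, may race with writers */
		rcu_read_unlock();
		if (need_seqretry(my_lock, seq)) {
			read_seqlock_excl(my_lock);
			v = my_read();	/* stable: writers are excluded */
			read_sequnlock_excl(my_lock);
		}
		return v;
	}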
@@ -3636,46 +3665,41 @@ static int do_move_mount(struct path *old_path,
 	ns = old->mnt_ns;
 
 	err = -EINVAL;
-	if (!may_use_mount(p))
-		goto out;
-
 	/* The thing moved must be mounted... */
 	if (!is_mounted(&old->mnt))
 		goto out;
 
-	/* ... and either ours or the root of anon namespace */
-	if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
-		goto out;
-
-	if (is_anon_ns(ns)) {
+	if (check_mnt(old)) {
+		/* if the source is in our namespace... */
+		/* ... it should be detachable from parent */
+		if (!mnt_has_parent(old) || IS_MNT_LOCKED(old))
+			goto out;
+		/* ... and the target should be in our namespace */
+		if (!check_mnt(p))
+			goto out;
+	} else {
 		/*
-		 * Ending up with two files referring to the root of the
-		 * same anonymous mount namespace would cause an error
-		 * as this would mean trying to move the same mount
-		 * twice into the mount tree which would be rejected
-		 * later. But be explicit about it right here.
+		 * otherwise the source must be the root of some anon namespace.
+		 * AV: check for mount being root of an anon namespace is worth
+		 * an inlined predicate...
 		 */
-		if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns))
+		if (!is_anon_ns(ns) || mnt_has_parent(old))
 			goto out;
-
 		/*
-		 * If this is an anonymous mount tree ensure that mount
-		 * propagation can detect mounts that were just
-		 * propagated to the target mount tree so we don't
-		 * propagate onto them.
+		 * Bail out early if the target is within the same namespace -
+		 * subsequent checks would've rejected that, but they lose
+		 * some corner cases if we check it early.
		 */
-		ns->mntns_flags |= MNTNS_PROPAGATING;
-	} else if (is_anon_ns(p->mnt_ns)) {
+		if (ns == p->mnt_ns)
+			goto out;
+
 		/*
-		 * Don't allow moving an attached mount tree to an
-		 * anonymous mount tree.
+		 * Target should be either in our namespace or in an acceptable
+		 * anon namespace, sensu check_anonymous_mnt().
 		 */
-		goto out;
+		if (!may_use_mount(p))
+			goto out;
 	}
 
-	if (old->mnt.mnt_flags & MNT_LOCKED)
-		goto out;
-
 	if (!path_mounted(old_path))
 		goto out;
@@ -3715,9 +3739,6 @@ static int do_move_mount(struct path *old_path,
 	if (err)
 		goto out;
 
-	if (is_anon_ns(ns))
-		ns->mntns_flags &= ~MNTNS_PROPAGATING;
-
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
 	list_del_init(&old->mnt_expire);
@@ -3899,10 +3920,6 @@ int finish_automount(struct vfsmount *m, const struct path *path)
 		return PTR_ERR(m);
 
 	mnt = real_mount(m);
-	/* The new mount record should have at least 2 refs to prevent it being
-	 * expired before we get a chance to add it
-	 */
-	BUG_ON(mnt_get_count(mnt) < 2);
 
 	if (m->mnt_sb == path->mnt->mnt_sb &&
 	    m->mnt_root == dentry) {
@@ -3935,7 +3952,6 @@ int finish_automount(struct vfsmount *m, const struct path *path)
 	unlock_mount(mp);
 	if (unlikely(err))
 		goto discard;
-	mntput(m);
 	return 0;
 
 discard_locked:
@@ -3949,7 +3965,6 @@ discard:
 		namespace_unlock();
 	}
 	mntput(m);
-	mntput(m);
 	return err;
 }
@@ -3986,11 +4001,14 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
+	 * - already mounted
 	 * - only referenced by its parent vfsmount
 	 * - still marked for expiry (marked on the last call here; marks are
 	 *   cleared by mntput())
 	 */
 	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
+		if (!is_mounted(&mnt->mnt))
+			continue;
 		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
 			propagate_mount_busy(mnt, 1))
 			continue;
@@ -5289,16 +5307,12 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
 			kattr.kflags |= MOUNT_KATTR_RECURSE;
 
 		ret = wants_mount_setattr(uattr, usize, &kattr);
-		if (ret < 0)
-			return ret;
-
-		if (ret) {
+		if (ret > 0) {
 			ret = do_mount_setattr(&file->f_path, &kattr);
-			if (ret)
-				return ret;
-
 			finish_mount_kattr(&kattr);
 		}
+		if (ret)
+			return ret;
 	}
 
 	fd = get_unused_fd_flags(flags & O_CLOEXEC);
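For context on the mark_mounts_for_expiry() hunk: an automounting filesystem arms expiry by placing the vfsmount on a list it owns and periodically kicking the reaper. A sketch under assumed names (the list and worker are illustrative; see the AFS and NFS callers for in-tree usage):

	/* Illustrative expiry wiring; my_expiry_list/my_reaper() are not
	 * from the patch. */
	static LIST_HEAD(my_expiry_list);

	static void my_arm_expiry(struct vfsmount *m)
	{
		/* queue m for the shrinker to consider */
		mnt_set_expiry(m, &my_expiry_list);
	}

	static void my_reaper(struct work_struct *work)
	{
		/* unmounts entries left unused since the previous call */
		mark_mounts_for_expiry(&my_expiry_list);
	}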
@@ -5491,7 +5505,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
 		seq->buf[seq->count] = '\0';
 		seq->count = start;
 		seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
-	} else if (r->mnt_devname) {
+	} else {
 		seq_puts(seq, r->mnt_devname);
 	}
 	return 0;
@@ -5804,7 +5818,9 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
 			  STATMOUNT_SB_SOURCE | \
 			  STATMOUNT_OPT_ARRAY | \
 			  STATMOUNT_OPT_SEC_ARRAY | \
-			  STATMOUNT_SUPPORTED_MASK)
+			  STATMOUNT_SUPPORTED_MASK | \
+			  STATMOUNT_MNT_UIDMAP | \
+			  STATMOUNT_MNT_GIDMAP)
 
 static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
 			struct mnt_namespace *ns)
@@ -5839,13 +5855,29 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
 			return err;
 
 	s->root = root;
-	s->idmap = mnt_idmap(s->mnt);
-	if (s->mask & STATMOUNT_SB_BASIC)
-		statmount_sb_basic(s);
 
+	/*
+	 * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
+	 * can change concurrently as we only hold the read-side of the
+	 * namespace semaphore and mount properties may change with only
+	 * the mount lock held.
+	 *
+	 * We could sample the mount lock sequence counter to detect
+	 * those changes and retry. But it's not worth it. Worst that
+	 * happens is that the mnt->mnt_idmap pointer is already changed
+	 * while mnt->mnt_flags isn't or vica versa. So what.
+	 *
+	 * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
+	 * via READ_ONCE()/WRITE_ONCE() and guard against theoretical
+	 * torn read/write. That's all we care about right now.
+	 */
+	s->idmap = mnt_idmap(s->mnt);
 	if (s->mask & STATMOUNT_MNT_BASIC)
 		statmount_mnt_basic(s);
 
+	if (s->mask & STATMOUNT_SB_BASIC)
+		statmount_sb_basic(s);
+
 	if (s->mask & STATMOUNT_PROPAGATE_FROM)
 		statmount_propagate_from(s);
@@ -6157,6 +6189,10 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
 		return -ENOENT;
 
+	/*
+	 * We only need to guard against mount topology changes as
+	 * listmount() doesn't care about any mount properties.
+	 */
 	scoped_guard(rwsem_read, &namespace_sem)
 		ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
 				   nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
@@ -6239,7 +6275,11 @@ void put_mnt_ns(struct mnt_namespace *ns)
 {
 	if (!refcount_dec_and_test(&ns->ns.count))
 		return;
-	drop_collected_mounts(&ns->root->mnt);
+	namespace_lock();
+	lock_mount_hash();
+	umount_tree(ns->root, 0);
+	unlock_mount_hash();
+	namespace_unlock();
 	free_mnt_ns(ns);
 }
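The STATMOUNT_SUPPORTED_MASK hunk advertises the mount uidmap/gidmap fields to userspace. A hedged userspace sketch of requesting them via the raw syscall, assuming uapi headers recent enough to define the two flags (error handling kept minimal):

	#include <linux/mount.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Minimal sketch: ask statmount(2) for a mount's uid/gid mappings. */
	static long request_mnt_idmaps(__u64 mnt_id, struct statmount *buf,
				       size_t bufsize)
	{
		struct mnt_id_req req = {
			.size	= MNT_ID_REQ_SIZE_VER1,
			.mnt_id	= mnt_id,
			.param	= STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP,
		};

		return syscall(__NR_statmount, &req, buf, bufsize, 0);
	}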