-rw-r--r--  fs/dcache.c                   |   4
-rw-r--r--  fs/ecryptfs/dentry.c          |  14
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h |  27
-rw-r--r--  fs/ecryptfs/file.c            |  15
-rw-r--r--  fs/ecryptfs/inode.c           |  19
-rw-r--r--  fs/ecryptfs/main.c            |  24
-rw-r--r--  fs/internal.h                 |   4
-rw-r--r--  fs/mount.h                    |  39
-rw-r--r--  fs/namespace.c                | 996
-rw-r--r--  fs/pnode.c                    |  75
-rw-r--r--  fs/pnode.h                    |   1
-rw-r--r--  fs/super.c                    |   3
-rw-r--r--  include/linux/fs.h            |   4
-rw-r--r--  include/linux/mount.h         |   9
-rw-r--r--  kernel/audit_tree.c           |  12
15 files changed, 602 insertions(+), 644 deletions(-)
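A note on the recurring idiom in the hunks below: open-coded lock/unlock pairs are replaced by scope-based guards built with DEFINE_LOCK_GUARD_0() from <linux/cleanup.h>, which release the lock automatically on every exit path of the enclosing scope. A minimal sketch of the pattern, with the guard definition as added to fs/mount.h and a consumer modeled on the converted path_is_under():

	/* definition (fs/mount.h) */
	DEFINE_LOCK_GUARD_0(mount_locked_reader,
			    read_seqlock_excl(&mount_lock),
			    read_sequnlock_excl(&mount_lock))

	/* consumer (fs/namespace.c) */
	bool path_is_under(const struct path *path1, const struct path *path2)
	{
		guard(mount_locked_reader)();	/* read_seqlock_excl() taken here */
		return is_path_reachable(real_mount(path1->mnt),
					 path1->dentry, path2);
		/* read_sequnlock_excl() runs automatically on return */
	}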
diff --git a/fs/dcache.c b/fs/dcache.c index 65cc11939654..b2b3b20b55f6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1390,6 +1390,7 @@ struct check_mount { unsigned int mounted; }; +/* locks: mount_locked_reader && dentry->d_lock */ static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; @@ -1416,9 +1417,8 @@ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; - read_seqlock_excl(&mount_lock); + guard(mount_locked_reader)(); d_walk(parent->dentry, &data, path_check_mount); - read_sequnlock_excl(&mount_lock); return data.mounted; } diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 1dfd5b81d831..6648a924e31a 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -59,14 +59,6 @@ static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name, return rc; } -struct kmem_cache *ecryptfs_dentry_info_cache; - -static void ecryptfs_dentry_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(ecryptfs_dentry_info_cache, - container_of(head, struct ecryptfs_dentry_info, rcu)); -} - /** * ecryptfs_d_release * @dentry: The ecryptfs dentry @@ -75,11 +67,7 @@ static void ecryptfs_dentry_free_rcu(struct rcu_head *head) */ static void ecryptfs_d_release(struct dentry *dentry) { - struct ecryptfs_dentry_info *p = dentry->d_fsdata; - if (p) { - path_put(&p->lower_path); - call_rcu(&p->rcu, ecryptfs_dentry_free_rcu); - } + dput(dentry->d_fsdata); } const struct dentry_operations ecryptfs_dops = { diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 1f562e75d0e4..9e6ab0b41337 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -258,13 +258,6 @@ struct ecryptfs_inode_info { struct ecryptfs_crypt_stat crypt_stat; }; -/* dentry private data. Each dentry must keep track of a lower - * vfsmount too. */ -struct ecryptfs_dentry_info { - struct path lower_path; - struct rcu_head rcu; -}; - /** * ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint * @flags: Status flags @@ -348,6 +341,7 @@ struct ecryptfs_mount_crypt_stat { /* superblock private data. */ struct ecryptfs_sb_info { struct super_block *wsi_sb; + struct vfsmount *lower_mnt; struct ecryptfs_mount_crypt_stat mount_crypt_stat; }; @@ -494,22 +488,25 @@ ecryptfs_set_superblock_lower(struct super_block *sb, } static inline void -ecryptfs_set_dentry_private(struct dentry *dentry, - struct ecryptfs_dentry_info *dentry_info) +ecryptfs_set_dentry_lower(struct dentry *dentry, + struct dentry *lower_dentry) { - dentry->d_fsdata = dentry_info; + dentry->d_fsdata = lower_dentry; } static inline struct dentry * ecryptfs_dentry_to_lower(struct dentry *dentry) { - return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; + return dentry->d_fsdata; } -static inline const struct path * -ecryptfs_dentry_to_lower_path(struct dentry *dentry) +static inline struct path +ecryptfs_lower_path(struct dentry *dentry) { - return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path; + return (struct path){ + .mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt, + .dentry = ecryptfs_dentry_to_lower(dentry) + }; } #define ecryptfs_printk(type, fmt, arg...) 
\ @@ -532,7 +529,6 @@ extern unsigned int ecryptfs_number_of_users; extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; extern struct kmem_cache *ecryptfs_file_info_cache; -extern struct kmem_cache *ecryptfs_dentry_info_cache; extern struct kmem_cache *ecryptfs_inode_info_cache; extern struct kmem_cache *ecryptfs_sb_info_cache; extern struct kmem_cache *ecryptfs_header_cache; @@ -557,7 +553,6 @@ int ecryptfs_encrypt_and_encode_filename( size_t *encoded_name_size, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, const char *name, size_t name_size); -struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); void ecryptfs_dump_hex(char *data, int bytes); int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 5f8f96da09fe..7929411837cf 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -33,13 +33,12 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, struct iov_iter *to) { ssize_t rc; - const struct path *path; struct file *file = iocb->ki_filp; rc = generic_file_read_iter(iocb, to); if (rc >= 0) { - path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); - touch_atime(path); + struct path path = ecryptfs_lower_path(file->f_path.dentry); + touch_atime(&path); } return rc; } @@ -59,12 +58,11 @@ static ssize_t ecryptfs_splice_read_update_atime(struct file *in, loff_t *ppos, size_t len, unsigned int flags) { ssize_t rc; - const struct path *path; rc = filemap_splice_read(in, ppos, pipe, len, flags); if (rc >= 0) { - path = ecryptfs_dentry_to_lower_path(in->f_path.dentry); - touch_atime(path); + struct path path = ecryptfs_lower_path(in->f_path.dentry); + touch_atime(&path); } return rc; } @@ -283,6 +281,7 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file) * ecryptfs_lookup() */ struct ecryptfs_file_info *file_info; struct file *lower_file; + struct path path; /* Released in ecryptfs_release or end of function if failure */ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); @@ -292,8 +291,8 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file) "Error attempting to allocate memory\n"); return -ENOMEM; } - lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), - file->f_flags, current_cred()); + path = ecryptfs_lower_path(ecryptfs_dentry); + lower_file = dentry_open(&path, file->f_flags, current_cred()); if (IS_ERR(lower_file)) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index abd954c6a14e..ed1394da8d6b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -327,24 +327,15 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry) { - const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); + struct dentry *lower_parent = ecryptfs_dentry_to_lower(dentry->d_parent); struct inode *inode, *lower_inode; - struct ecryptfs_dentry_info *dentry_info; int rc = 0; - dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if (!dentry_info) { - dput(lower_dentry); - return ERR_PTR(-ENOMEM); - } - fsstack_copy_attr_atime(d_inode(dentry->d_parent), - d_inode(path->dentry)); + d_inode(lower_parent)); BUG_ON(!d_count(lower_dentry)); - ecryptfs_set_dentry_private(dentry, dentry_info); - dentry_info->lower_path.mnt = mntget(path->mnt); - 
dentry_info->lower_path.dentry = lower_dentry; + ecryptfs_set_dentry_lower(dentry, lower_dentry); /* * negative dentry can go positive under us here - its parent is not @@ -1021,10 +1012,10 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, { struct dentry *dentry = path->dentry; struct kstat lower_stat; + struct path lower_path = ecryptfs_lower_path(dentry); int rc; - rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry), - &lower_stat, request_mask, flags); + rc = vfs_getattr_nosec(&lower_path, &lower_stat, request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index eab1beb846d3..16ea14dd2c62 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -106,15 +106,14 @@ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); - const struct path *path = ecryptfs_dentry_to_lower_path(dentry); + struct path path = ecryptfs_lower_path(dentry); int rc; - rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, - cred); + rc = ecryptfs_privileged_open(lower_file, path.dentry, path.mnt, cred); if (rc) { printk(KERN_ERR "Error opening lower file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " - "rc = [%d]\n", path->dentry, path->mnt, rc); + "rc = [%d]\n", path.dentry, path.mnt, rc); (*lower_file) = NULL; } return rc; @@ -437,7 +436,6 @@ static int ecryptfs_get_tree(struct fs_context *fc) struct ecryptfs_fs_context *ctx = fc->fs_private; struct ecryptfs_sb_info *sbi = fc->s_fs_info; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; - struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; struct path path; @@ -543,14 +541,8 @@ static int ecryptfs_get_tree(struct fs_context *fc) goto out_free; } - rc = -ENOMEM; - root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if (!root_info) - goto out_free; - - /* ->kill_sb() will take care of root_info */ - ecryptfs_set_dentry_private(s->s_root, root_info); - root_info->lower_path = path; + ecryptfs_set_dentry_lower(s->s_root, path.dentry); + ecryptfs_superblock_to_private(s)->lower_mnt = path.mnt; s->s_flags |= SB_ACTIVE; fc->root = dget(s->s_root); @@ -580,6 +572,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb) kill_anon_super(sb); if (!sb_info) return; + mntput(sb_info->lower_mnt); ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } @@ -668,11 +661,6 @@ static struct ecryptfs_cache_info { .size = sizeof(struct ecryptfs_file_info), }, { - .cache = &ecryptfs_dentry_info_cache, - .name = "ecryptfs_dentry_info_cache", - .size = sizeof(struct ecryptfs_dentry_info), - }, - { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), diff --git a/fs/internal.h b/fs/internal.h index a33d18ee5b74..b5c62abefff4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -84,9 +84,9 @@ void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); -int path_mount(const char *dev_name, struct path *path, +int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page); -int path_umount(struct path *path, int flags); +int path_umount(const struct path *path, int flags); int show_path(struct seq_file *m, struct dentry *root); diff --git a/fs/mount.h 
b/fs/mount.h index 79c85639a7ba..f13a28752d0b 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -58,7 +58,10 @@ struct mount { #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ - struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ + struct mount * __aligned(1) *mnt_pprev_for_sb; + /* except that LSB of pprev is stolen */ +#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ @@ -148,6 +151,11 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) extern seqlock_t mount_lock; +DEFINE_LOCK_GUARD_0(mount_writer, write_seqlock(&mount_lock), + write_sequnlock(&mount_lock)) +DEFINE_LOCK_GUARD_0(mount_locked_reader, read_seqlock_excl(&mount_lock), + read_sequnlock_excl(&mount_lock)) + struct proc_mounts { struct mnt_namespace *ns; struct path root; @@ -224,4 +232,33 @@ static inline void mnt_notify_add(struct mount *m) } #endif +static inline struct mount *topmost_overmount(struct mount *m) +{ + while (m->overmount) + m = m->overmount; + return m; +} + +static inline bool __test_write_hold(struct mount * __aligned(1) *val) +{ + return (unsigned long)val & WRITE_HOLD; +} + +static inline bool test_write_hold(const struct mount *m) +{ + return __test_write_hold(m->mnt_pprev_for_sb); +} + +static inline void set_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + | WRITE_HOLD); +} + +static inline void clear_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + & ~WRITE_HOLD); +} + struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index dc01b14c58cd..d39499ab5cb5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -91,6 +91,14 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ +static inline void namespace_lock(void); +static void namespace_unlock(void); +DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock()) +DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem), + up_read(&namespace_sem)) + +DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T)) + #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif @@ -363,7 +371,7 @@ out_free_cache: * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ -bool __mnt_is_readonly(struct vfsmount *mnt) +bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } @@ -403,7 +411,7 @@ static unsigned int mnt_get_writers(struct mount *mnt) #endif } -static int mnt_is_readonly(struct vfsmount *mnt) +static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; @@ -444,31 +452,31 @@ int mnt_get_write_access(struct vfsmount *m) mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass - * MNT_WRITE_HOLD loop below, so that the slowpath can see our - * incremented count after it has set MNT_WRITE_HOLD. 
+ * WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion if the task - * setting MNT_WRITE_HOLD got preempted on a remote + * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents livelock if the task setting - * MNT_WRITE_HOLD has a lower priority and is bound to + * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); - lock_mount_hash(); - unlock_mount_hash(); + read_seqlock_excl(&mount_lock); + read_sequnlock_excl(&mount_lock); preempt_disable(); } } /* * The barrier pairs with the barrier sb_start_ro_state_change() making - * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. @@ -606,16 +614,16 @@ EXPORT_SYMBOL(mnt_drop_write_file); * a call to mnt_unhold_writers() in order to stop preventing write access to * @mnt. * - * Context: This function expects lock_mount_hash() to be held serializing - * setting MNT_WRITE_HOLD. + * Context: This function expects to be in mount_locked_reader scope serializing + * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + set_write_hold(mnt); /* - * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); @@ -631,9 +639,9 @@ static inline int mnt_hold_writers(struct mount *mnt) * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. - * MNT_WRITE_HOLD protects against this scenario, because + * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on - * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) @@ -649,19 +657,42 @@ static inline int mnt_hold_writers(struct mount *mnt) * Stop preventing write access to @mnt allowing callers to gain write access * to @mnt again. * - * This function can only be called after a successful call to - * mnt_hold_writers(). + * This function can only be called after a call to mnt_hold_writers(). * - * Context: This function expects lock_mount_hash() to be held. + * Context: This function expects to be in the same mount_locked_reader scope + * as the matching mnt_hold_writers(). */ static inline void mnt_unhold_writers(struct mount *mnt) { + if (!test_write_hold(mnt)) + return; /* - * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY.
*/ smp_wmb(); - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + clear_write_hold(mnt); +} + +static inline void mnt_del_instance(struct mount *m) +{ + struct mount **p = m->mnt_pprev_for_sb; + struct mount *next = m->mnt_next_for_sb; + + if (next) + next->mnt_pprev_for_sb = p; + *p = next; +} + +static inline void mnt_add_instance(struct mount *m, struct super_block *s) +{ + struct mount *first = s->s_mounts; + + if (first) + first->mnt_pprev_for_sb = &m->mnt_next_for_sb; + m->mnt_next_for_sb = first; + m->mnt_pprev_for_sb = &s->s_mounts; + s->s_mounts = m; } static int mnt_make_readonly(struct mount *mnt) @@ -677,17 +708,17 @@ static int mnt_make_readonly(struct mount *mnt) int sb_prepare_remount_readonly(struct super_block *sb) { - struct mount *mnt; int err = 0; - /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + /* Racy optimization. Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; - lock_mount_hash(); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - err = mnt_hold_writers(mnt); + guard(mount_locked_reader)(); + + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (!(m->mnt.mnt_flags & MNT_READONLY)) { + err = mnt_hold_writers(m); if (err) break; } @@ -697,11 +728,10 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (test_write_hold(m)) + clear_write_hold(m); } - unlock_mount_hash(); return err; } @@ -760,24 +790,16 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) } /** - * __lookup_mnt - find first child mount + * __lookup_mnt - mount hash lookup * @mnt: parent mount - * @dentry: mountpoint - * - * If @mnt has a child mount @c mounted @dentry find and return it. + * @dentry: dentry of mountpoint * - * Note that the child mount @c need not be unique. There are cases - * where shadow mounts are created. For example, during mount - * propagation when a source mount @mnt whose root got overmounted by a - * mount @o after path lookup but before @namespace_sem could be - * acquired gets copied and propagated. So @mnt gets copied including - * @o. When @mnt is propagated to a destination mount @d that already - * has another mount @n mounted at the same mountpoint then the source - * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on - * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt - * on @dentry. + * If @mnt has a child mount @c mounted on @dentry find and return it. + * Caller must either hold the spinlock component of @mount_lock or + * hold rcu_read_lock(), sample the seqcount component before the call + * and recheck it afterwards. * - * Return: The first child of @mnt mounted @dentry or NULL. + * Return: The child of @mnt mounted on @dentry or %NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { @@ -790,21 +812,12 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) return NULL; } -/* - * lookup_mnt - Return the first child mount mounted at path - * - * "First" means first mounted chronologically. 
If you create the - * following mounts: - * - * mount /dev/sda1 /mnt - * mount /dev/sda2 /mnt - * mount /dev/sda3 /mnt - * - * Then lookup_mnt() on the base /mnt dentry in the root mount will - * return successively the root dentry and vfsmount of /dev/sda1, then - * /dev/sda2, then /dev/sda3, then NULL. +/** + * lookup_mnt - Return the child mount mounted at given location + * @path: location in the namespace * - * lookup_mnt takes a reference to the found vfsmount. + * Acquires and returns a new reference to mount at given location + * or %NULL if nothing is mounted there. */ struct vfsmount *lookup_mnt(const struct path *path) { @@ -841,22 +854,20 @@ bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; - bool is_covered = false; - down_read(&namespace_sem); - rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { - is_covered = (mnt->mnt_mountpoint == dentry); - if (is_covered) - break; - } - up_read(&namespace_sem); + guard(namespace_shared)(); - return is_covered; + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) + if (mnt->mnt_mountpoint == dentry) + return true; + + return false; } struct pinned_mountpoint { struct hlist_node node; struct mountpoint *mp; + struct mount *parent; }; static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) @@ -947,7 +958,7 @@ static void unpin_mountpoint(struct pinned_mountpoint *m) } } -static inline int check_mnt(struct mount *mnt) +static inline int check_mnt(const struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } @@ -1149,6 +1160,20 @@ static void commit_tree(struct mount *mnt) touch_mnt_namespace(n); } +static void setup_mnt(struct mount *m, struct dentry *root) +{ + struct super_block *s = root->d_sb; + + atomic_inc(&s->s_active); + m->mnt.mnt_sb = s; + m->mnt.mnt_root = dget(root); + m->mnt_mountpoint = m->mnt.mnt_root; + m->mnt_parent = m; + + guard(mount_locked_reader)(); + mnt_add_instance(m, s); +} + /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached @@ -1172,15 +1197,8 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - atomic_inc(&fc->root->d_sb->s_active); - mnt->mnt.mnt_sb = fc->root->d_sb; - mnt->mnt.mnt_root = dget(fc->root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; + setup_mnt(mnt, fc->root); - lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); - unlock_mount_hash(); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); @@ -1238,7 +1256,6 @@ EXPORT_SYMBOL_GPL(vfs_kern_mount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { - struct super_block *sb = old->mnt.mnt_sb; struct mount *mnt; int err; @@ -1263,16 +1280,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (mnt->mnt_group_id) set_mnt_shared(mnt); - atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); - mnt->mnt.mnt_sb = sb; - mnt->mnt.mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; - lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &sb->s_mounts); - unlock_mount_hash(); + setup_mnt(mnt, root); if (flag & CL_PRIVATE) // we are done with it return mnt; @@ -1378,7 +1388,7 @@ static void mntput_no_expire(struct mount *mnt) mnt->mnt.mnt_flags |= MNT_DOOMED; 
rcu_read_unlock(); - list_del(&mnt->mnt_instance); + mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); @@ -1719,8 +1729,6 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } -DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock()) - enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, @@ -1785,6 +1793,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); + bulk_make_private(&tmp_list); + while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; @@ -1809,7 +1819,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) umount_mnt(p); } } - change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); @@ -1969,10 +1978,11 @@ void __detach_mounts(struct dentry *dentry) struct pinned_mountpoint mp = {}; struct mount *mnt; - namespace_lock(); - lock_mount_hash(); + guard(namespace_excl)(); + guard(mount_writer)(); + if (!lookup_mountpoint(dentry, &mp)) - goto out_unlock; + return; event++; while (mp.node.next) { @@ -1984,9 +1994,6 @@ void __detach_mounts(struct dentry *dentry) else umount_tree(mnt, UMOUNT_CONNECTED); } unpin_mountpoint(&mp); -out_unlock: - unlock_mount_hash(); - namespace_unlock(); } /* @@ -2025,7 +2032,7 @@ static int can_umount(const struct path *path, int flags) } // caller is responsible for flags being sane -int path_umount(struct path *path, int flags) +int path_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; @@ -2238,7 +2245,7 @@ static inline bool extend_array(struct path **res, struct path **to_free, return p; } -struct path *collect_paths(const struct path *path, +const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); @@ -2246,7 +2253,7 @@ struct path *collect_paths(const struct path *path, struct path *res = prealloc, *to_free = NULL; unsigned n = 0; - guard(rwsem_read)(&namespace_sem); + guard(namespace_shared)(); if (!check_mnt(root)) return ERR_PTR(-EINVAL); @@ -2272,9 +2279,9 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, const struct path *prealloc) { - for (struct path *p = paths; p->mnt; p++) + for (const struct path *p = paths; p->mnt; p++) path_put(p); if (paths != prealloc) kfree(paths); @@ -2301,7 +2308,7 @@ void dissolve_on_fput(struct vfsmount *mnt) return; } - scoped_guard(namespace_lock, &namespace_sem) { + scoped_guard(namespace_excl) { if (!anon_ns_root(m)) return; @@ -2312,6 +2319,7 @@ void dissolve_on_fput(struct vfsmount *mnt) } } +/* locks: namespace_shared && pinned(mnt) || mount_locked_reader */ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -2328,12 +2336,8 @@ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) bool has_locked_children(struct mount *mnt, struct dentry *dentry) { - bool res; - - read_seqlock_excl(&mount_lock); - res = __has_locked_children(mnt, dentry); - read_sequnlock_excl(&mount_lock); - return res; + guard(mount_locked_reader)(); + return __has_locked_children(mnt, dentry); } /* @@ -2341,21 +2345,15 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) * specified subtree. 
Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. + * + * locks: mount_locked_reader || namespace_shared && pinned(subtree) */ static bool check_for_nsfs_mounts(struct mount *subtree) { - struct mount *p; - bool ret = false; - - lock_mount_hash(); - for (p = subtree; p; p = next_mnt(p, subtree)) + for (struct mount *p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) - goto out; - - ret = true; -out: - unlock_mount_hash(); - return ret; + return false; + return true; } /** @@ -2375,7 +2373,7 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - guard(rwsem_read)(&namespace_sem); + guard(namespace_shared)(); if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); @@ -2496,8 +2494,7 @@ enum mnt_tree_flags_t { /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached - * @dest_mnt: mount that @source_mnt will be mounted on - * @dest_mp: the mountpoint @source_mnt will be mounted at + * @dest: the context for mounting at the place where the tree should go * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2560,10 +2557,11 @@ enum mnt_tree_flags_t { * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *dest_mnt, - struct mountpoint *dest_mp) + const struct pinned_mountpoint *dest) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mount *dest_mnt = dest->parent; + struct mountpoint *dest_mp = dest->mp; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct pinned_mountpoint root = {}; @@ -2643,10 +2641,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, child->mnt_mountpoint); commit_tree(child); if (q) { + struct mount *r = topmost_overmount(child); struct mountpoint *mp = root.mp; - struct mount *r = child; - while (unlikely(r->overmount)) - r = r->overmount; + if (unlikely(shorter) && child != source_mnt) mp = shorter; mnt_change_mountpoint(r, mp, q); @@ -2675,110 +2672,120 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } +static inline struct mount *where_to_mount(const struct path *path, + struct dentry **dentry, + bool beneath) +{ + struct mount *m; + + if (unlikely(beneath)) { + m = topmost_overmount(real_mount(path->mnt)); + *dentry = m->mnt_mountpoint; + return m->mnt_parent; + } + m = __lookup_mnt(path->mnt, path->dentry); + if (unlikely(m)) { + m = topmost_overmount(m); + *dentry = m->mnt.mnt_root; + return m; + } + *dentry = path->dentry; + return real_mount(path->mnt); +} + /** - * do_lock_mount - lock mount and mountpoint - * @path: target path - * @beneath: whether the intention is to mount beneath @path - * - * Follow the mount stack on @path until the top mount @mnt is found. If - * the initial @path->{mnt,dentry} is a mountpoint lookup the first - * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root} - * until nothing is stacked on top of it anymore. + * do_lock_mount - acquire environment for mounting + * @path: target path + * @res: context to set up + * @beneath: whether the intention is to mount beneath @path * - * Acquire the inode_lock() on the top mount's ->mnt_root to protect - * against concurrent removal of the new mountpoint from another mount - * namespace. 
+ * To mount something at given location, we need + * namespace_sem locked exclusive + * inode of dentry we are mounting on locked exclusive + * struct mountpoint for that dentry + * struct mount we are mounting on * - * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint - * @mp on @mnt->mnt_parent must be acquired. This protects against a - * concurrent unlink of @mp->mnt_dentry from another mount namespace - * where @mnt doesn't have a child mount mounted @mp. A concurrent - * removal of @mnt->mnt_root doesn't matter as nothing will be mounted - * on top of it for @beneath. + * Results are stored in caller-supplied context (pinned_mountpoint); + * on success we have res->parent and res->mp pointing to parent and + * mountpoint respectively and res->node inserted into the ->m_list + * of the mountpoint, making sure the mountpoint won't disappear. + * On failure we have res->parent set to ERR_PTR(-E...), res->mp + * left NULL, res->node - empty. + * In case of success do_lock_mount returns with locks acquired (in + * proper order - inode lock nests outside of namespace_sem). * - * In addition, @beneath needs to make sure that @mnt hasn't been - * unmounted or moved from its current mountpoint in between dropping - * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt - * being unmounted would be detected later by e.g., calling - * check_mnt(mnt) in the function it's called from. For the @beneath - * case however, it's useful to detect it directly in do_lock_mount(). - * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points - * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will - * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. + * Request to mount on overmounted location is treated as "mount on + * top of whatever's overmounting it"; request to mount beneath + * a location - "mount immediately beneath the topmost mount at that + * place". * - * Return: Either the target mountpoint on the top mount or the top - * mount's mountpoint. + * In all cases the location must not have been unmounted and the + * chosen mountpoint must be allowed to be mounted on. For "beneath" + * case we also require the location to be at the root of a mount + * that has a parent (i.e. is not a root of some namespace). */ -static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath) +static void do_lock_mount(const struct path *path, + struct pinned_mountpoint *res, + bool beneath) { - struct vfsmount *mnt = path->mnt; - struct dentry *dentry; - struct path under = {}; - int err = -ENOENT; + int err; - for (;;) { - struct mount *m = real_mount(mnt); + if (unlikely(beneath) && !path_mounted(path)) { + res->parent = ERR_PTR(-EINVAL); + return; + } - if (beneath) { - path_put(&under); - read_seqlock_excl(&mount_lock); - under.mnt = mntget(&m->mnt_parent->mnt); - under.dentry = dget(m->mnt_mountpoint); - read_sequnlock_excl(&mount_lock); - dentry = under.dentry; - } else { - dentry = path->dentry; + do { + struct dentry *dentry, *d; + struct mount *m, *n; + + scoped_guard(mount_locked_reader) { + m = where_to_mount(path, &dentry, beneath); + if (&m->mnt != path->mnt) { + mntget(&m->mnt); + dget(dentry); + } } inode_lock(dentry->d_inode); namespace_lock(); - if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) - break; // not to be mounted on + // check if the chain of mounts (if any) has changed. 
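+ // (The pair sampled above was taken before we blocked on inode_lock()
+ // and namespace_lock(); mounts may have been attached, detached or
+ // moved while we slept, so resample under mount_locked_reader and
+ // restart from scratch on any mismatch.)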
+ scoped_guard(mount_locked_reader) + n = where_to_mount(path, &d, beneath); - if (beneath && unlikely(m->mnt_mountpoint != dentry || - &m->mnt_parent->mnt != under.mnt)) { - namespace_unlock(); - inode_unlock(dentry->d_inode); - continue; // got moved - } + if (unlikely(n != m || dentry != d)) + err = -EAGAIN; // something moved, retry + else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt))) + err = -ENOENT; // not to be mounted on + else if (beneath && &m->mnt == path->mnt && !m->overmount) + err = -EINVAL; + else + err = get_mountpoint(dentry, res); - mnt = lookup_mnt(path); - if (unlikely(mnt)) { + if (unlikely(err)) { + res->parent = ERR_PTR(err); namespace_unlock(); inode_unlock(dentry->d_inode); - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - continue; // got overmounted + } else { + res->parent = m; } - err = get_mountpoint(dentry, pinned); - if (err) - break; - if (beneath) { - /* - * @under duplicates the references that will stay - * at least until namespace_unlock(), so the path_put() - * below is safe (and OK to do under namespace_lock - - * we are not dropping the final references here). - */ - path_put(&under); + /* + * Drop the temporary references. This is subtle - on success + * we are doing that under namespace_sem, which would normally + * be forbidden. However, in that case we are guaranteed that + * refcounts won't reach zero, since we know that path->mnt + * is mounted and thus all mounts reachable from it are pinned + * and stable, along with their mountpoints and roots. + */ + if (&m->mnt != path->mnt) { + dput(dentry); + mntput(&m->mnt); } - return 0; - } - namespace_unlock(); - inode_unlock(dentry->d_inode); - if (beneath) - path_put(&under); - return err; -} - -static inline int lock_mount(struct path *path, struct pinned_mountpoint *m) -{ - return do_lock_mount(path, m, false); + } while (err == -EAGAIN); } -static void unlock_mount(struct pinned_mountpoint *m) +static void __unlock_mount(struct pinned_mountpoint *m) { inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); @@ -2787,16 +2794,30 @@ static void unlock_mount(struct pinned_mountpoint *m) namespace_unlock(); } -static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) +static inline void unlock_mount(struct pinned_mountpoint *m) +{ + if (!IS_ERR(m->parent)) + __unlock_mount(m); +} + +#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + do_lock_mount((path), &mp, (beneath)) +#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) +#define LOCK_MOUNT_EXACT(mp, path) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + lock_mount_exact((path), &mp) + +static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; - if (d_is_dir(mp->m_dentry) != + if (d_is_dir(mp->mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp); + return attach_recursive_mnt(mnt, mp); } static int may_change_propagation(const struct mount *m) @@ -2832,13 +2853,13 @@ static int flags_to_propagation_type(int ms_flags) /* * recursively change the type of the mountpoint. 
*/ -static int do_change_type(struct path *path, int ms_flags) +static int do_change_type(const struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; - int err = 0; + int err; if (!path_mounted(path)) return -EINVAL; @@ -2847,23 +2868,22 @@ static int do_change_type(struct path *path, int ms_flags) if (!type) return -EINVAL; - namespace_lock(); + guard(namespace_excl)(); + err = may_change_propagation(mnt); if (err) - goto out_unlock; + return err; if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) - goto out_unlock; + return err; } for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - out_unlock: - namespace_unlock(); - return err; + return 0; } /* may_copy_tree() - check if a mount tree can be copied @@ -2909,7 +2929,7 @@ static int do_change_type(struct path *path, int ms_flags) * * Returns true if the mount tree can be copied, false otherwise. */ -static inline bool may_copy_tree(struct path *path) +static inline bool may_copy_tree(const struct path *path) { struct mount *mnt = real_mount(path->mnt); const struct dentry_operations *d_op; @@ -2931,7 +2951,7 @@ static inline bool may_copy_tree(struct path *path) } -static struct mount *__do_loopback(struct path *old_path, int recurse) +static struct mount *__do_loopback(const struct path *old_path, int recurse) { struct mount *old = real_mount(old_path->mnt); @@ -2953,12 +2973,11 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) /* * do loopback mount. */ -static int do_loopback(struct path *path, const char *old_name, - int recurse) +static int do_loopback(const struct path *path, const char *old_name, + int recurse) { - struct path old_path; - struct mount *mnt = NULL, *parent; - struct pinned_mountpoint mp = {}; + struct path old_path __free(path_put) = {}; + struct mount *mnt = NULL; int err; if (!old_name || !*old_name) return -EINVAL; @@ -2966,49 +2985,40 @@ static int do_loopback(struct path *path, const char *old_name, if (err) return err; - err = -EINVAL; if (mnt_ns_loop(old_path.dentry)) - goto out; + return -EINVAL; - err = lock_mount(path, &mp); - if (err) - goto out; + LOCK_MOUNT(mp, path); + if (IS_ERR(mp.parent)) + return PTR_ERR(mp.parent); - parent = real_mount(path->mnt); - if (!check_mnt(parent)) - goto out2; + if (!check_mnt(mp.parent)) + return -EINVAL; mnt = __do_loopback(&old_path, recurse); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - goto out2; - } + if (IS_ERR(mnt)) + return PTR_ERR(mnt); - err = graft_tree(mnt, parent, mp.mp); + err = graft_tree(mnt, &mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } -out2: - unlock_mount(&mp); -out: - path_put(&old_path); return err; } -static struct file *open_detached_copy(struct path *path, bool recursive) +static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; - struct file *file; ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) - return ERR_CAST(ns); + return ns; - namespace_lock(); + guard(namespace_excl)(); /* * Record the sequence number of the source mount namespace. 
@@ -3025,23 +3035,28 @@ static struct file *open_detached_copy(struct path *path, bool recursive) mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { - namespace_unlock(); - free_mnt_ns(ns); + emptied_ns = ns; return ERR_CAST(mnt); } - lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { mnt_add_to_ns(ns, p); ns->nr_mounts++; } ns->root = mnt; - mntget(&mnt->mnt); - unlock_mount_hash(); - namespace_unlock(); + return ns; +} + +static struct file *open_detached_copy(struct path *path, bool recursive) +{ + struct mnt_namespace *ns = get_detached_copy(path, recursive); + struct file *file; + + if (IS_ERR(ns)) + return ERR_CAST(ns); mntput(path->mnt); - path->mnt = &mnt->mnt; + path->mnt = mntget(&ns->root->mnt); file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); @@ -3158,7 +3173,8 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) touch_mnt_namespace(mnt->mnt_ns); } -static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) +static void mnt_warn_timestamp_expiry(const struct path *mountpoint, + struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; @@ -3192,7 +3208,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ -static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) +static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); @@ -3229,7 +3245,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. 
*/ -static int do_remount(struct path *path, int sb_flags, +static int do_remount(const struct path *path, int sb_flags, int mnt_flags, void *data) { int err; @@ -3287,49 +3303,46 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -static int do_set_group(struct path *from_path, struct path *to_path) +static int do_set_group(const struct path *from_path, const struct path *to_path) { - struct mount *from, *to; + struct mount *from = real_mount(from_path->mnt); + struct mount *to = real_mount(to_path->mnt); int err; - from = real_mount(from_path->mnt); - to = real_mount(to_path->mnt); - - namespace_lock(); + guard(namespace_excl)(); err = may_change_propagation(from); if (err) - goto out; + return err; err = may_change_propagation(to); if (err) - goto out; + return err; - err = -EINVAL; /* To and From paths should be mount roots */ if (!path_mounted(from_path)) - goto out; + return -EINVAL; if (!path_mounted(to_path)) - goto out; + return -EINVAL; /* Setting sharing groups is only allowed across same superblock */ if (from->mnt.mnt_sb != to->mnt.mnt_sb) - goto out; + return -EINVAL; /* From mount root should be wider than To mount root */ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) - goto out; + return -EINVAL; /* From mount should not have locked children in place of To's root */ if (__has_locked_children(from, to->mnt.mnt_root)) - goto out; + return -EINVAL; /* Setting sharing groups is only allowed on private mounts */ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) - goto out; + return -EINVAL; /* From should not be private */ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) - goto out; + return -EINVAL; if (IS_MNT_SLAVE(from)) { hlist_add_behind(&to->mnt_slave, &from->mnt_slave); @@ -3341,11 +3354,7 @@ static int do_set_group(struct path *from_path, struct path *to_path) list_add(&to->mnt_share, &from->mnt_share); set_mnt_shared(to); } - - err = 0; -out: - namespace_unlock(); - return err; + return 0; } /** @@ -3389,17 +3398,15 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) /** * can_move_mount_beneath - check that we can mount beneath the top mount - * @from: mount to mount beneath - * @to: mount under which to mount - * @mp: mountpoint of @to + * @mnt_from: mount we are trying to move + * @mnt_to: mount under which to mount + * @mp: mountpoint of @mnt_to * - * - Make sure that @to->dentry is actually the root of a mount under - * which we can mount another mount. * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. - * - Ensure that nothing has been mounted on top of @from before we + * - Ensure that nothing has been mounted on top of @mnt_from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. * - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to @@ -3408,25 +3415,17 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. 
*/ -static int can_move_mount_beneath(const struct path *from, - const struct path *to, +static int can_move_mount_beneath(const struct mount *mnt_from, + const struct mount *mnt_to, const struct mountpoint *mp) { - struct mount *mnt_from = real_mount(from->mnt), - *mnt_to = real_mount(to->mnt), - *parent_mnt_to = mnt_to->mnt_parent; - - if (!mnt_has_parent(mnt_to)) - return -EINVAL; - - if (!path_mounted(to)) - return -EINVAL; + struct mount *parent_mnt_to = mnt_to->mnt_parent; if (IS_MNT_LOCKED(mnt_to)) return -EINVAL; /* Avoid creating shadow mounts during mount propagation. */ - if (path_overmounted(from)) + if (mnt_from->overmount) return -EINVAL; /* @@ -3517,97 +3516,83 @@ static inline bool may_use_mount(struct mount *mnt) return check_anonymous_mnt(mnt); } -static int do_move_mount(struct path *old_path, - struct path *new_path, enum mnt_tree_flags_t flags) +static int do_move_mount(const struct path *old_path, + const struct path *new_path, + enum mnt_tree_flags_t flags) { - struct mnt_namespace *ns; - struct mount *p; - struct mount *old; - struct mount *parent; - struct pinned_mountpoint mp; + struct mount *old = real_mount(old_path->mnt); int err; bool beneath = flags & MNT_TREE_BENEATH; - err = do_lock_mount(new_path, &mp, beneath); - if (err) - return err; + if (!path_mounted(old_path)) + return -EINVAL; - old = real_mount(old_path->mnt); - p = real_mount(new_path->mnt); - parent = old->mnt_parent; - ns = old->mnt_ns; + if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) + return -EINVAL; - err = -EINVAL; + LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath); + if (IS_ERR(mp.parent)) + return PTR_ERR(mp.parent); if (check_mnt(old)) { /* if the source is in our namespace... */ /* ... it should be detachable from parent */ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) - goto out; + return -EINVAL; + /* ... which should not be shared */ + if (IS_MNT_SHARED(old->mnt_parent)) + return -EINVAL; /* ... and the target should be in our namespace */ - if (!check_mnt(p)) - goto out; - /* parent of the source should not be shared */ - if (IS_MNT_SHARED(parent)) - goto out; + if (!check_mnt(mp.parent)) + return -EINVAL; } else { /* * otherwise the source must be the root of some anon namespace. */ if (!anon_ns_root(old)) - goto out; + return -EINVAL; /* * Bail out early if the target is within the same namespace - * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ - if (ns == p->mnt_ns) - goto out; + if (old->mnt_ns == mp.parent->mnt_ns) + return -EINVAL; /* * Target should be either in our namespace or in an acceptable * anon namespace, sensu check_anonymous_mnt(). */ - if (!may_use_mount(p)) - goto out; + if (!may_use_mount(mp.parent)) + return -EINVAL; } - if (!path_mounted(old_path)) - goto out; - - if (d_is_dir(new_path->dentry) != - d_is_dir(old_path->dentry)) - goto out; - if (beneath) { - err = can_move_mount_beneath(old_path, new_path, mp.mp); - if (err) - goto out; + struct mount *over = real_mount(new_path->mnt); - err = -EINVAL; - p = p->mnt_parent; + if (mp.parent != over->mnt_parent) + over = mp.parent->overmount; + err = can_move_mount_beneath(old, over, mp.mp); + if (err) + return err; } /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. 
*/ - if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) - goto out; - err = -ELOOP; + if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old)) + return -EINVAL; if (!check_for_nsfs_mounts(old)) - goto out; - if (mount_is_ancestor(old, p)) - goto out; + return -ELOOP; + if (mount_is_ancestor(old, mp.parent)) + return -ELOOP; - err = attach_recursive_mnt(old, p, mp.mp); -out: - unlock_mount(&mp); - return err; + return attach_recursive_mnt(old, &mp); } -static int do_move_mount_old(struct path *path, const char *old_name) +static int do_move_mount_old(const struct path *path, const char *old_name) { - struct path old_path; + struct path old_path __free(path_put) = {}; int err; if (!old_name || !*old_name) @@ -3617,18 +3602,19 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path, 0); - path_put(&old_path); - return err; + return do_move_mount(&old_path, path, 0); } /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, - const struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp, + int mnt_flags) { - struct mount *parent = real_mount(path->mnt); + struct mount *parent = mp->parent; + + if (IS_ERR(parent)) + return PTR_ERR(parent); mnt_flags &= ~MNT_INTERNAL_FLAGS; @@ -3642,14 +3628,15 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, } /* Refuse the same filesystem on the same mount point */ - if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path)) + if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb && + parent->mnt.mnt_root == mp->mp->m_dentry) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; - return graft_tree(newmnt, parent, mp); + return graft_tree(newmnt, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); @@ -3658,41 +3645,32 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags * Create a new mount using a superblock configuration and request it * be added to the namespace tree. 
*/ -static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, +static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint, unsigned int mnt_flags) { - struct vfsmount *mnt; - struct pinned_mountpoint mp = {}; - struct super_block *sb = fc->root->d_sb; + struct super_block *sb; + struct vfsmount *mnt __free(mntput) = fc_mount(fc); int error; - error = security_sb_kern_mount(sb); - if (!error && mount_too_revealing(sb, &mnt_flags)) { - errorfcp(fc, "VFS", "Mount too revealing"); - error = -EPERM; - } + if (IS_ERR(mnt)) + return PTR_ERR(mnt); - if (unlikely(error)) { - fc_drop_locked(fc); + sb = fc->root->d_sb; + error = security_sb_kern_mount(sb); + if (unlikely(error)) return error; - } - up_write(&sb->s_umount); - - mnt = vfs_create_mount(fc); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); + if (unlikely(mount_too_revealing(sb, &mnt_flags))) { + errorfcp(fc, "VFS", "Mount too revealing"); + return -EPERM; + } mnt_warn_timestamp_expiry(mountpoint, mnt); - error = lock_mount(mountpoint, &mp); - if (!error) { - error = do_add_mount(real_mount(mnt), mp.mp, - mountpoint, mnt_flags); - unlock_mount(&mp); - } - if (error < 0) - mntput(mnt); + LOCK_MOUNT(mp, mountpoint); + error = do_add_mount(real_mount(mnt), &mp, mnt_flags); + if (!error) + retain_and_null_ptr(mnt); // consumed on success return error; } @@ -3700,8 +3678,9 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *fstype, int sb_flags, - int mnt_flags, const char *name, void *data) +static int do_new_mount(const struct path *path, const char *fstype, + int sb_flags, int mnt_flags, + const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; @@ -3747,18 +3726,38 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, if (!err && !mount_capable(fc)) err = -EPERM; if (!err) - err = vfs_get_tree(fc); - if (!err) err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc); return err; } -int finish_automount(struct vfsmount *m, const struct path *path) +static void lock_mount_exact(const struct path *path, + struct pinned_mountpoint *mp) { struct dentry *dentry = path->dentry; - struct pinned_mountpoint mp = {}; + int err; + + inode_lock(dentry->d_inode); + namespace_lock(); + if (unlikely(cant_mount(dentry))) + err = -ENOENT; + else if (path_overmounted(path)) + err = -EBUSY; + else + err = get_mountpoint(dentry, mp); + if (unlikely(err)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + mp->parent = ERR_PTR(err); + } else { + mp->parent = real_mount(path->mnt); + } +} + +int finish_automount(struct vfsmount *__m, const struct path *path) +{ + struct vfsmount *m __free(mntput) = __m; struct mount *mnt; int err; @@ -3769,43 +3768,21 @@ int finish_automount(struct vfsmount *m, const struct path *path) mnt = real_mount(m); - if (m->mnt_sb == path->mnt->mnt_sb && - m->mnt_root == dentry) { - err = -ELOOP; - goto discard; - } + if (m->mnt_root == path->dentry) + return -ELOOP; /* - * we don't want to use lock_mount() - in this case finding something + * we don't want to use LOCK_MOUNT() - in this case finding something * that overmounts our mountpoint means "quietly drop what we've * got", not "try to mount it on top".
*/ - inode_lock(dentry->d_inode); - namespace_lock(); - if (unlikely(cant_mount(dentry))) { - err = -ENOENT; - goto discard_locked; - } - if (path_overmounted(path)) { - err = 0; - goto discard_locked; - } - err = get_mountpoint(dentry, &mp); - if (err) - goto discard_locked; - - err = do_add_mount(mnt, mp.mp, path, - path->mnt->mnt_flags | MNT_SHRINKABLE); - unlock_mount(&mp); - if (unlikely(err)) - goto discard; - return 0; + LOCK_MOUNT_EXACT(mp, path); + if (mp.parent == ERR_PTR(-EBUSY)) + return 0; -discard_locked: - namespace_unlock(); - inode_unlock(dentry->d_inode); -discard: - mntput(m); + err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (likely(!err)) + retain_and_null_ptr(m); return err; } @@ -3816,9 +3793,8 @@ discard: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - read_seqlock_excl(&mount_lock); + guard(mount_locked_reader)(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - read_sequnlock_excl(&mount_lock); } EXPORT_SYMBOL(mnt_set_expiry); @@ -3835,8 +3811,8 @@ void mark_mounts_for_expiry(struct list_head *mounts) if (list_empty(mounts)) return; - namespace_lock(); - lock_mount_hash(); + guard(namespace_excl)(); + guard(mount_writer)(); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -3858,8 +3834,6 @@ void mark_mounts_for_expiry(struct list_head *mounts) touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } - unlock_mount_hash(); - namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); @@ -3987,7 +3961,7 @@ static char *copy_mount_string(const void __user *data) * Therefore, if this magic number is present, it carries no information * and must be discarded. */ -int path_mount(const char *dev_name, struct path *path, +int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; @@ -4069,15 +4043,13 @@ int path_mount(const char *dev_name, struct path *path, int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { - struct path path; + struct path path __free(path_put) = {}; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; - ret = path_mount(dev_name, &path, type_page, flags, data_page); - path_put(&path); - return ret; + return path_mount(dev_name, &path, type_page, flags, data_page); } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) @@ -4138,7 +4110,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct vfsmount *rootmnt __free(mntput) = NULL; + struct vfsmount *pwdmnt __free(mntput) = NULL; struct mount *p, *q; struct mount *old; struct mount *new; @@ -4157,23 +4130,19 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, if (IS_ERR(new_ns)) return new_ns; - namespace_lock(); + guard(namespace_excl)(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { - namespace_unlock(); - ns_common_free(ns); - dec_mnt_namespaces(new_ns->ucounts); - mnt_ns_release(new_ns); + emptied_ns = new_ns; return ERR_CAST(new); } if (user_ns != ns->user_ns) { - 
@@ -3835,8 +3811,8 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 	if (list_empty(mounts))
 		return;
 
-	namespace_lock();
-	lock_mount_hash();
+	guard(namespace_excl)();
+	guard(mount_writer)();
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -3858,8 +3834,6 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		touch_mnt_namespace(mnt->mnt_ns);
 		umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
 	}
-	unlock_mount_hash();
-	namespace_unlock();
 }
 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
 
@@ -3987,7 +3961,7 @@ static char *copy_mount_string(const void __user *data)
  * Therefore, if this magic number is present, it carries no information
  * and must be discarded.
  */
-int path_mount(const char *dev_name, struct path *path,
+int path_mount(const char *dev_name, const struct path *path,
 		const char *type_page, unsigned long flags, void *data_page)
 {
 	unsigned int mnt_flags = 0, sb_flags;
@@ -4069,15 +4043,13 @@ int path_mount(const char *dev_name, struct path *path,
 int do_mount(const char *dev_name, const char __user *dir_name,
 	     const char *type_page, unsigned long flags, void *data_page)
 {
-	struct path path;
+	struct path path __free(path_put) = {};
 	int ret;
 
 	ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
 	if (ret)
 		return ret;
-	ret = path_mount(dev_name, &path, type_page, flags, data_page);
-	path_put(&path);
-	return ret;
+	return path_mount(dev_name, &path, type_page, flags, data_page);
 }
 
 static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
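The same pattern covers struct path: <linux/path.h> carries DEFINE_FREE(path_put, struct path, path_put(&_T)), so a __free(path_put) local is released on every exit from the function. Initializing with {} keeps the cleanup safe before the lookup succeeds, since path_put() on a zeroed path is a no-op. A sketch under those assumptions (do_frob() is hypothetical):

#include <linux/namei.h>
#include <linux/path.h>

static int frob_by_name(const char __user *dir_name)
{
	struct path path __free(path_put) = {};
	int ret;

	ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
	if (ret)
		return ret;	/* nothing pinned yet; the {} put is a no-op */
	return do_frob(&path);	/* path_put() runs once the return value
				   has been computed */
}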
@@ -4138,7 +4110,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 		struct user_namespace *user_ns, struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
-	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
+	struct vfsmount *rootmnt __free(mntput) = NULL;
+	struct vfsmount *pwdmnt __free(mntput) = NULL;
 	struct mount *p, *q;
 	struct mount *old;
 	struct mount *new;
@@ -4157,23 +4130,19 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 	if (IS_ERR(new_ns))
 		return new_ns;
 
-	namespace_lock();
+	guard(namespace_excl)();
 	/* First pass: copy the tree topology */
 	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
 	if (user_ns != ns->user_ns)
 		copy_flags |= CL_SLAVE;
 	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
-		namespace_unlock();
-		ns_common_free(ns);
-		dec_mnt_namespaces(new_ns->ucounts);
-		mnt_ns_release(new_ns);
+		emptied_ns = new_ns;
 		return ERR_CAST(new);
 	}
 	if (user_ns != ns->user_ns) {
-		lock_mount_hash();
+		guard(mount_writer)();
 		lock_mnt_tree(new);
-		unlock_mount_hash();
 	}
 	new_ns->root = new;
@@ -4205,13 +4174,6 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 		while (p->mnt.mnt_root != q->mnt.mnt_root)
 			p = next_mnt(skip_mnt_tree(p), old);
 	}
-	namespace_unlock();
-
-	if (rootmnt)
-		mntput(rootmnt);
-	if (pwdmnt)
-		mntput(pwdmnt);
-
 	ns_tree_add_raw(new_ns);
 	return new_ns;
 }
@@ -4436,7 +4398,8 @@ err_unlock:
 	return ret;
 }
 
-static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
+static inline int vfs_move_mount(const struct path *from_path,
+				 const struct path *to_path,
 				 enum mnt_tree_flags_t mflags)
 {
 	int ret;
@@ -4542,7 +4505,7 @@ SYSCALL_DEFINE5(move_mount,
 /*
  * Return true if path is reachable from root
  *
- * namespace_sem or mount_lock is held
+ * locks: mount_locked_reader || namespace_shared && is_mounted(mnt)
 */
 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 			 const struct path *root)
@@ -4556,11 +4519,8 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 
 bool path_is_under(const struct path *path1, const struct path *path2)
 {
-	bool res;
-	read_seqlock_excl(&mount_lock);
-	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-	read_sequnlock_excl(&mount_lock);
-	return res;
+	guard(mount_locked_reader)();
+	return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
 }
 EXPORT_SYMBOL(path_is_under);
 
@@ -4592,9 +4552,10 @@ EXPORT_SYMBOL(path_is_under);
 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		const char __user *, put_old)
 {
-	struct path new, old, root;
+	struct path new __free(path_put) = {};
+	struct path old __free(path_put) = {};
+	struct path root __free(path_put) = {};
 	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
-	struct pinned_mountpoint old_mp = {};
 	int error;
 
 	if (!may_mount())
@@ -4603,57 +4564,54 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = user_path_at(AT_FDCWD, new_root,
 			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
 	if (error)
-		goto out0;
+		return error;
 
 	error = user_path_at(AT_FDCWD, put_old,
 			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
 	if (error)
-		goto out1;
+		return error;
 
 	error = security_sb_pivotroot(&old, &new);
 	if (error)
-		goto out2;
+		return error;
 
 	get_fs_root(current->fs, &root);
-	error = lock_mount(&old, &old_mp);
-	if (error)
-		goto out3;
-	error = -EINVAL;
+	LOCK_MOUNT(old_mp, &old);
+	old_mnt = old_mp.parent;
+	if (IS_ERR(old_mnt))
+		return PTR_ERR(old_mnt);
+
 	new_mnt = real_mount(new.mnt);
 	root_mnt = real_mount(root.mnt);
-	old_mnt = real_mount(old.mnt);
 	ex_parent = new_mnt->mnt_parent;
 	root_parent = root_mnt->mnt_parent;
 	if (IS_MNT_SHARED(old_mnt) ||
 		IS_MNT_SHARED(ex_parent) ||
 		IS_MNT_SHARED(root_parent))
-		goto out4;
+		return -EINVAL;
 	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
-		goto out4;
+		return -EINVAL;
 	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
-		goto out4;
-	error = -ENOENT;
+		return -EINVAL;
 	if (d_unlinked(new.dentry))
-		goto out4;
-	error = -EBUSY;
+		return -ENOENT;
 	if (new_mnt == root_mnt || old_mnt == root_mnt)
-		goto out4; /* loop, on the same file system */
-	error = -EINVAL;
+		return -EBUSY; /* loop, on the same file system */
 	if (!path_mounted(&root))
-		goto out4; /* not a mountpoint */
+		return -EINVAL; /* not a mountpoint */
 	if (!mnt_has_parent(root_mnt))
-		goto out4; /* absolute root */
+		return -EINVAL; /* absolute root */
 	if (!path_mounted(&new))
-		goto out4; /* not a mountpoint */
+		return -EINVAL; /* not a mountpoint */
 	if (!mnt_has_parent(new_mnt))
-		goto out4; /* absolute root */
+		return -EINVAL; /* absolute root */
 
 	/* make sure we can reach put_old from new_root */
-	if (!is_path_reachable(old_mnt, old.dentry, &new))
-		goto out4;
+	if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new))
+		return -EINVAL;
 	/* make certain new is below the root */
 	if (!is_path_reachable(new_mnt, new.dentry, &root))
-		goto out4;
+		return -EINVAL;
 	lock_mount_hash();
 	umount_mnt(new_mnt);
 	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
@@ -4672,17 +4630,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	mnt_notify_add(root_mnt);
 	mnt_notify_add(new_mnt);
 	chroot_fs_refs(&root, &new);
-	error = 0;
-out4:
-	unlock_mount(&old_mp);
-out3:
-	path_put(&root);
-out2:
-	path_put(&old);
-out1:
-	path_put(&new);
-out0:
-	return error;
+	return 0;
 }
 
 static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
@@ -4772,8 +4720,10 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
 		if (!mnt_allow_writers(kattr, m)) {
 			err = mnt_hold_writers(m);
-			if (err)
+			if (err) {
+				m = next_mnt(m, mnt);
 				break;
+			}
 		}
 
 		if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
@@ -4781,25 +4731,9 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
 	}
 
 	if (err) {
-		struct mount *p;
-
-		/*
-		 * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
-		 * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
-		 * mounts and needs to take care to include the first mount.
-		 */
-		for (p = mnt; p; p = next_mnt(p, mnt)) {
-			/* If we had to hold writers unblock them. */
-			if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
-				mnt_unhold_writers(p);
-
-			/*
-			 * We're done once the first mount we changed got
-			 * MNT_WRITE_HOLD unset.
-			 */
-			if (p == m)
-				break;
-		}
+		/* undo all mnt_hold_writers() we'd done */
+		for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt))
+			mnt_unhold_writers(p);
 	}
 	return err;
 }
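The reworked unwind in mount_setattr_prepare() deserves a note: mnt_hold_writers() leaves its mark on the mount even when it returns an error, so the failed mount must be unwound along with its predecessors. Advancing m one step past the failure point turns the rollback into a simple half-open traversal of [mnt, m). A toy model of the shape, with hold()/unhold() standing in for the real helpers:

struct node { struct node *next; };

static int hold(struct node *n);	/* marks n even on failure */
static void unhold(struct node *n);	/* safe on any marked node */

static int hold_all(struct node *first)
{
	struct node *n, *stop = NULL;
	int err = 0;

	for (n = first; n; n = n->next) {
		err = hold(n);
		if (err) {
			stop = n->next;	/* include the failed node in the unwind */
			break;
		}
	}
	if (err) {
		/* everything in [first, stop) carries the mark */
		for (n = first; n != stop; n = n->next)
			unhold(n);
	}
	return err;
}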
@@ -4830,8 +4764,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
 			WRITE_ONCE(m->mnt.mnt_flags, flags);
 
 		/* If we had to hold writers unblock them. */
-		if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
-			mnt_unhold_writers(m);
+		mnt_unhold_writers(m);
 
 		if (kattr->propagation)
 			change_mnt_propagation(m, kattr->propagation);
@@ -4841,7 +4774,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
 	touch_mnt_namespace(mnt->mnt_ns);
 }
 
-static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
+static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr)
 {
 	struct mount *mnt = real_mount(path->mnt);
 	int err = 0;
@@ -5639,6 +5572,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
 			  STATMOUNT_MNT_UIDMAP | \
 			  STATMOUNT_MNT_GIDMAP)
 
+/* locks: namespace_shared */
 static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
 			struct mnt_namespace *ns)
 {
@@ -5885,7 +5819,7 @@ retry:
 	if (ret)
 		return ret;
 
-	scoped_guard(rwsem_read, &namespace_sem)
+	scoped_guard(namespace_shared)
 		ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
 
 	if (!ret)
@@ -5906,6 +5840,7 @@ struct klistmount {
 	struct path root;
 };
 
+/* locks: namespace_shared */
 static ssize_t do_listmount(struct klistmount *kls, bool reverse)
 {
 	struct mnt_namespace *ns = kls->ns;
@@ -6040,7 +5975,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	 * We only need to guard against mount topology changes as
	 * listmount() doesn't care about any mount properties.
 	 */
-	scoped_guard(rwsem_read, &namespace_sem)
+	scoped_guard(namespace_shared)
 		ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
 	if (ret <= 0)
 		return ret;
@@ -6127,12 +6062,10 @@
 void put_mnt_ns(struct mnt_namespace *ns)
 {
 	if (!ns_ref_put(ns))
 		return;
-	namespace_lock();
+	guard(namespace_excl)();
 	emptied_ns = ns;
-	lock_mount_hash();
+	guard(mount_writer)();
 	umount_tree(ns->root, 0);
-	unlock_mount_hash();
-	namespace_unlock();
 }
 
 struct vfsmount *kern_mount(struct file_system_type *type)
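Two forms of the guard idiom alternate through these hunks: guard(type)() holds the lock from the declaration to whatever exit the function takes, which is what lets put_mnt_ns() above and mnt_already_visible() below shed their unlock labels, while scoped_guard(type) confines the lock to a single statement or block, as in the statmount()/listmount() paths. A sketch of both, reusing the namespace_shared guard assumed earlier; locked_count() is hypothetical:

/* whole-scope form: every return path unlocks */
static bool any_positive(const int *v, int n)
{
	guard(namespace_shared)();
	for (int i = 0; i < n; i++)
		if (v[i] > 0)
			return true;	/* the unlock runs here, too */
	return false;
}

/* statement-scoped form: the lock covers exactly one statement */
static int sample(void)
{
	int ret;

	scoped_guard(namespace_shared)
		ret = locked_count();
	return ret;
}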
@@ -6181,25 +6114,18 @@ bool our_mnt(struct vfsmount *mnt)
 bool current_chrooted(void)
 {
 	/* Does the current process have a non-standard root */
-	struct path ns_root;
-	struct path fs_root;
-	bool chrooted;
-
-	/* Find the namespace root */
-	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
-	ns_root.dentry = ns_root.mnt->mnt_root;
-	path_get(&ns_root);
-	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
-		;
+	struct path fs_root __free(path_put) = {};
+	struct mount *root;
 
 	get_fs_root(current->fs, &fs_root);
 
-	chrooted = !path_equal(&fs_root, &ns_root);
+	/* Find the namespace root */
+
+	guard(mount_locked_reader)();
 
-	path_put(&fs_root);
-	path_put(&ns_root);
+	root = topmost_overmount(current->nsproxy->mnt_ns->root);
 
-	return chrooted;
+	return fs_root.mnt != &root->mnt || !path_mounted(&fs_root);
 }
 
 static bool mnt_already_visible(struct mnt_namespace *ns,
@@ -6208,9 +6134,8 @@
 {
 	int new_flags = *new_mnt_flags;
 	struct mount *mnt, *n;
-	bool visible = false;
 
-	down_read(&namespace_sem);
+	guard(namespace_shared)();
 	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
 		struct mount *child;
 		int mnt_flags;
@@ -6257,13 +6182,10 @@
 		/* Preserve the locked attributes */
 		*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
 					       MNT_LOCK_ATIME);
-		visible = true;
-		goto found;
+		return true;
 	next:	;
 	}
-found:
-	up_read(&namespace_sem);
-	return visible;
+	return false;
 }
 
 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
diff --git a/fs/pnode.c b/fs/pnode.c
index 6f7d02f3fa98..5d91c3e58d2a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -29,6 +29,7 @@ static inline struct mount *next_slave(struct mount *p)
 	return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave);
 }
 
+/* locks: namespace_shared && is_mounted(mnt) */
 static struct mount *get_peer_under_root(struct mount *mnt,
 					 struct mnt_namespace *ns,
 					 const struct path *root)
@@ -50,7 +51,7 @@ static struct mount *get_peer_under_root(struct mount *mnt,
  * Get ID of closest dominating peer group having a representative
 * under the given root.
 *
- * Caller must hold namespace_sem
+ * locks: namespace_shared
 */
 int get_dominating_id(struct mount *mnt, const struct path *root)
 {
@@ -70,19 +71,6 @@ static inline bool will_be_unmounted(struct mount *m)
 	return m->mnt.mnt_flags & MNT_UMOUNT;
 }
 
-static struct mount *propagation_source(struct mount *mnt)
-{
-	do {
-		struct mount *m;
-		for (m = next_peer(mnt); m != mnt; m = next_peer(m)) {
-			if (!will_be_unmounted(m))
-				return m;
-		}
-		mnt = mnt->mnt_master;
-	} while (mnt && will_be_unmounted(mnt));
-	return mnt;
-}
-
 static void transfer_propagation(struct mount *mnt, struct mount *to)
 {
 	struct hlist_node *p = NULL, *n;
@@ -111,11 +99,10 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		return;
 	}
 	if (IS_MNT_SHARED(mnt)) {
-		if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list))
-			m = propagation_source(mnt);
 		if (list_empty(&mnt->mnt_share)) {
 			mnt_release_group_id(mnt);
 		} else {
+			m = next_peer(mnt);
 			list_del_init(&mnt->mnt_share);
 			mnt->mnt_group_id = 0;
 		}
@@ -136,6 +123,57 @@
 	}
 }
 
+static struct mount *trace_transfers(struct mount *m)
+{
+	while (1) {
+		struct mount *next = next_peer(m);
+
+		if (next != m) {
+			list_del_init(&m->mnt_share);
+			m->mnt_group_id = 0;
+			m->mnt_master = next;
+		} else {
+			if (IS_MNT_SHARED(m))
+				mnt_release_group_id(m);
+			next = m->mnt_master;
+		}
+		hlist_del_init(&m->mnt_slave);
+		CLEAR_MNT_SHARED(m);
+		SET_MNT_MARK(m);
+
+		if (!next || !will_be_unmounted(next))
+			return next;
+		if (IS_MNT_MARKED(next))
+			return next->mnt_master;
+		m = next;
+	}
+}
+
+static void set_destinations(struct mount *m, struct mount *master)
+{
+	struct mount *next;
+
+	while ((next = m->mnt_master) != master) {
+		m->mnt_master = master;
+		m = next;
+	}
+}
+
+void bulk_make_private(struct list_head *set)
+{
+	struct mount *m;
+
+	list_for_each_entry(m, set, mnt_list)
+		if (!IS_MNT_MARKED(m))
+			set_destinations(m, trace_transfers(m));
+
+	list_for_each_entry(m, set, mnt_list) {
+		transfer_propagation(m, m->mnt_master);
+		m->mnt_master = NULL;
+		CLEAR_MNT_MARK(m);
+	}
+}
+
 static struct mount *__propagation_next(struct mount *m,
 					struct mount *origin)
 {
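bulk_make_private() works in two passes: trace_transfers() walks each chain of doomed masters once, detaching and marking every mount it visits, and returns the surviving destination; set_destinations() then repoints the whole chain directly at that destination, union-find style, so chains sharing a suffix are never walked twice. The compression step in isolation, as a toy model over a bare master pointer:

struct m { struct m *master; };

/* repoint every node strictly between m and master at master itself;
 * a later call on an already-flattened chain terminates immediately,
 * since the very first ->master it sees is the final destination */
static void flatten_chain(struct m *m, struct m *master)
{
	struct m *next;

	while ((next = m->master) != master) {
		m->master = master;
		m = next;
	}
}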
@@ -304,9 +342,8 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 			err = PTR_ERR(this);
 			break;
 		}
-		read_seqlock_excl(&mount_lock);
-		mnt_set_mountpoint(n, dest_mp, this);
-		read_sequnlock_excl(&mount_lock);
+		scoped_guard(mount_locked_reader)
+			mnt_set_mountpoint(n, dest_mp, this);
 		if (n->mnt_master)
 			SET_MNT_MARK(n->mnt_master);
 		copy = this;
diff --git a/fs/pnode.h b/fs/pnode.h
index 00ab153e3e9d..b029db225f33 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -42,6 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2)
 }
 
 void change_mnt_propagation(struct mount *, int);
+void bulk_make_private(struct list_head *);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);
 void propagate_umount(struct list_head *);
diff --git a/fs/super.c b/fs/super.c
index f4fa0e93c463..5bab94fb7e03 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	if (!s)
 		return NULL;
 
-	INIT_LIST_HEAD(&s->s_mounts);
 	s->s_user_ns = get_user_ns(user_ns);
 	init_rwsem(&s->s_umount);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -408,7 +407,7 @@ static void __put_super(struct super_block *s)
 		list_del_init(&s->s_list);
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
-		WARN_ON(!list_empty(&s->s_mounts));
+		WARN_ON(s->s_mounts);
 		call_rcu(&s->rcu, destroy_super_rcu);
 	}
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75fb216b0f7a..d3c023ff1a86 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1434,6 +1434,8 @@ struct sb_writers {
 	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
 
+struct mount;
+
 struct super_block {
 	struct list_head	s_list;		/* Keep this first */
 	dev_t			s_dev;		/* search index; _not_ kdev_t */
@@ -1468,7 +1470,7 @@ struct super_block {
 	__u16 s_encoding_flags;
 #endif
 	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
-	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
+	struct mount		*s_mounts;	/* list of mounts; _not_ for fs use */
 	struct block_device	*s_bdev;	/* can go away once we use an accessor for @s_bdev_file */
 	struct file		*s_bdev_file;
 	struct backing_dev_info *s_bdi;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5f9c053b0897..acfe7ef86a1b 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -33,7 +33,6 @@ enum mount_flags {
 	MNT_NOSYMFOLLOW	= 0x80,
 
 	MNT_SHRINKABLE	= 0x100,
-	MNT_WRITE_HOLD	= 0x200,
 
 	MNT_INTERNAL	= 0x4000,
@@ -52,7 +51,7 @@ enum mount_flags {
 				 | MNT_READONLY | MNT_NOSYMFOLLOW,
 	MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,
 
-	MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED |
+	MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED |
 			     MNT_SYNC_UMOUNT | MNT_LOCKED
 };
@@ -77,7 +76,7 @@ extern void mntput(struct vfsmount *mnt);
 extern struct vfsmount *mntget(struct vfsmount *mnt);
 extern void mnt_make_shortterm(struct vfsmount *mnt);
 extern struct vfsmount *mnt_clone_internal(const struct path *path);
-extern bool __mnt_is_readonly(struct vfsmount *mnt);
+extern bool __mnt_is_readonly(const struct vfsmount *mnt);
 extern bool mnt_may_suid(struct vfsmount *mnt);
 
 extern struct vfsmount *clone_private_mount(const struct path *path);
@@ -104,8 +103,8 @@ extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
 int do_mount(const char *, const char __user *, const char *,
 	     unsigned long, void *);
-extern struct path *collect_paths(const struct path *, struct path *, unsigned);
-extern void drop_collected_paths(struct path *, struct path *);
+extern const struct path *collect_paths(const struct path *, struct path *, unsigned);
+extern void drop_collected_paths(const struct path *, const struct path *);
 extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num);
 
 extern int cifs_root_data(char **dev, char **opts);
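The constified collect_paths() contract, for reference: it fills the caller's small on-stack array when it can, presumably falling back to an allocation otherwise (hence the paired drop_collected_paths() call), and the result is sentinel-terminated, so iteration stops at the first entry whose dentry is NULL, exactly as in the audit loops below. A usage sketch under those assumptions; the ERR_PTR() check reflects what an allocation failure would presumably return:

#include <linux/err.h>
#include <linux/mount.h>
#include <linux/path.h>

static int count_collected(const struct path *where)
{
	struct path array[16];
	const struct path *paths;
	int n = 0;

	paths = collect_paths(where, array, ARRAY_SIZE(array));
	if (IS_ERR(paths))
		return PTR_ERR(paths);

	for (const struct path *p = paths; p->dentry; p++)
		n++;			/* a NULL ->dentry terminates the array */

	drop_collected_paths(paths, array);
	return n;
}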
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1605df0a171e..fda6beb041e0 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -680,7 +680,7 @@ void audit_trim_trees(void)
 	struct audit_tree *tree;
 	struct path path;
 	struct audit_node *node;
-	struct path *paths;
+	const struct path *paths;
 	struct path array[16];
 	int err;
 
@@ -703,7 +703,7 @@ void audit_trim_trees(void)
 			struct audit_chunk *chunk = find_chunk(node);
 			/* this could be NULL if the watch is dying else where... */
 			node->index |= 1U<<31;
-			for (struct path *p = paths; p->dentry; p++) {
+			for (const struct path *p = paths; p->dentry; p++) {
 				struct inode *inode = p->dentry->d_inode;
 				if (inode_to_key(inode) == chunk->key) {
 					node->index &= ~(1U<<31);
@@ -742,9 +742,9 @@ void audit_put_tree(struct audit_tree *tree)
 	put_tree(tree);
 }
 
-static int tag_mounts(struct path *paths, struct audit_tree *tree)
+static int tag_mounts(const struct path *paths, struct audit_tree *tree)
 {
-	for (struct path *p = paths; p->dentry; p++) {
+	for (const struct path *p = paths; p->dentry; p++) {
 		int err = tag_chunk(p->dentry->d_inode, tree);
 		if (err)
 			return err;
@@ -807,7 +807,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
 	struct audit_tree *seed = rule->tree, *tree;
 	struct path path;
 	struct path array[16];
-	struct path *paths;
+	const struct path *paths;
 	int err;
 
 	rule->tree = NULL;
@@ -879,7 +879,7 @@ int audit_tag_tree(char *old, char *new)
 	int failed = 0;
 	struct path path1, path2;
 	struct path array[16];
-	struct path *paths;
+	const struct path *paths;
 	int err;
 
 	err = kern_path(new, 0, &path2);
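One piece used throughout but not defined in these hunks is LOCK_MOUNT()/LOCK_MOUNT_EXACT(). Judging by the call sites - errors surface as an ERR_PTR() in mp.parent and no explicit unlock_mount() remains - they presumably wrap the lock_mount()-style setup in a <linux/cleanup.h> class. A speculative reconstruction, not the actual definition; it assumes unlock_mount() tolerates a pinned_mountpoint whose parent is an ERR_PTR():

#include <linux/cleanup.h>

DEFINE_CLASS(lock_mount_exact, struct pinned_mountpoint,
	     unlock_mount(&_T),		/* destructor; assumed no-op on error */
	     ({
		struct pinned_mountpoint mp = {};
		lock_mount_exact(path, &mp);
		mp;
	     }),
	     const struct path *path)

#define LOCK_MOUNT_EXACT(mp, path)	CLASS(lock_mount_exact, mp)(path)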