summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- fs/dcache.c 4
-rw-r--r-- fs/ecryptfs/dentry.c 14
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h 27
-rw-r--r-- fs/ecryptfs/file.c 15
-rw-r--r-- fs/ecryptfs/inode.c 19
-rw-r--r-- fs/ecryptfs/main.c 24
-rw-r--r-- fs/internal.h 4
-rw-r--r-- fs/mount.h 39
-rw-r--r-- fs/namespace.c 996
-rw-r--r-- fs/pnode.c 75
-rw-r--r-- fs/pnode.h 1
-rw-r--r-- fs/super.c 3
-rw-r--r-- include/linux/fs.h 4
-rw-r--r-- include/linux/mount.h 9
-rw-r--r-- kernel/audit_tree.c 12
15 files changed, 602 insertions, 644 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index 65cc11939654..b2b3b20b55f6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1390,6 +1390,7 @@ struct check_mount {
unsigned int mounted;
};
+/* locks: mount_locked_reader && dentry->d_lock */
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
struct check_mount *info = data;
@@ -1416,9 +1417,8 @@ int path_has_submounts(const struct path *parent)
{
struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
- read_seqlock_excl(&mount_lock);
+ guard(mount_locked_reader)();
d_walk(parent->dentry, &data, path_check_mount);
- read_sequnlock_excl(&mount_lock);
return data.mounted;
}
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 1dfd5b81d831..6648a924e31a 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -59,14 +59,6 @@ static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name,
return rc;
}
-struct kmem_cache *ecryptfs_dentry_info_cache;
-
-static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
-{
- kmem_cache_free(ecryptfs_dentry_info_cache,
- container_of(head, struct ecryptfs_dentry_info, rcu));
-}
-
/**
* ecryptfs_d_release
* @dentry: The ecryptfs dentry
@@ -75,11 +67,7 @@ static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
*/
static void ecryptfs_d_release(struct dentry *dentry)
{
- struct ecryptfs_dentry_info *p = dentry->d_fsdata;
- if (p) {
- path_put(&p->lower_path);
- call_rcu(&p->rcu, ecryptfs_dentry_free_rcu);
- }
+ dput(dentry->d_fsdata);
}
const struct dentry_operations ecryptfs_dops = {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 1f562e75d0e4..9e6ab0b41337 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -258,13 +258,6 @@ struct ecryptfs_inode_info {
struct ecryptfs_crypt_stat crypt_stat;
};
-/* dentry private data. Each dentry must keep track of a lower
- * vfsmount too. */
-struct ecryptfs_dentry_info {
- struct path lower_path;
- struct rcu_head rcu;
-};
-
/**
* ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint
* @flags: Status flags
@@ -348,6 +341,7 @@ struct ecryptfs_mount_crypt_stat {
/* superblock private data. */
struct ecryptfs_sb_info {
struct super_block *wsi_sb;
+ struct vfsmount *lower_mnt;
struct ecryptfs_mount_crypt_stat mount_crypt_stat;
};
@@ -494,22 +488,25 @@ ecryptfs_set_superblock_lower(struct super_block *sb,
}
static inline void
-ecryptfs_set_dentry_private(struct dentry *dentry,
- struct ecryptfs_dentry_info *dentry_info)
+ecryptfs_set_dentry_lower(struct dentry *dentry,
+ struct dentry *lower_dentry)
{
- dentry->d_fsdata = dentry_info;
+ dentry->d_fsdata = lower_dentry;
}
static inline struct dentry *
ecryptfs_dentry_to_lower(struct dentry *dentry)
{
- return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
+ return dentry->d_fsdata;
}
-static inline const struct path *
-ecryptfs_dentry_to_lower_path(struct dentry *dentry)
+static inline struct path
+ecryptfs_lower_path(struct dentry *dentry)
{
- return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
+ return (struct path){
+ .mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt,
+ .dentry = ecryptfs_dentry_to_lower(dentry)
+ };
}
#define ecryptfs_printk(type, fmt, arg...) \
@@ -532,7 +529,6 @@ extern unsigned int ecryptfs_number_of_users;
extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
extern struct kmem_cache *ecryptfs_file_info_cache;
-extern struct kmem_cache *ecryptfs_dentry_info_cache;
extern struct kmem_cache *ecryptfs_inode_info_cache;
extern struct kmem_cache *ecryptfs_sb_info_cache;
extern struct kmem_cache *ecryptfs_header_cache;
@@ -557,7 +553,6 @@ int ecryptfs_encrypt_and_encode_filename(
size_t *encoded_name_size,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
const char *name, size_t name_size);
-struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
void ecryptfs_dump_hex(char *data, int bytes);
int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
int sg_size);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 5f8f96da09fe..7929411837cf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -33,13 +33,12 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
struct iov_iter *to)
{
ssize_t rc;
- const struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_read_iter(iocb, to);
if (rc >= 0) {
- path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
- touch_atime(path);
+ struct path path = ecryptfs_lower_path(file->f_path.dentry);
+ touch_atime(&path);
}
return rc;
}
@@ -59,12 +58,11 @@ static ssize_t ecryptfs_splice_read_update_atime(struct file *in, loff_t *ppos,
size_t len, unsigned int flags)
{
ssize_t rc;
- const struct path *path;
rc = filemap_splice_read(in, ppos, pipe, len, flags);
if (rc >= 0) {
- path = ecryptfs_dentry_to_lower_path(in->f_path.dentry);
- touch_atime(path);
+ struct path path = ecryptfs_lower_path(in->f_path.dentry);
+ touch_atime(&path);
}
return rc;
}
@@ -283,6 +281,7 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file)
* ecryptfs_lookup() */
struct ecryptfs_file_info *file_info;
struct file *lower_file;
+ struct path path;
/* Released in ecryptfs_release or end of function if failure */
file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL);
@@ -292,8 +291,8 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file)
"Error attempting to allocate memory\n");
return -ENOMEM;
}
- lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry),
- file->f_flags, current_cred());
+ path = ecryptfs_lower_path(ecryptfs_dentry);
+ lower_file = dentry_open(&path, file->f_flags, current_cred());
if (IS_ERR(lower_file)) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index abd954c6a14e..ed1394da8d6b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -327,24 +327,15 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ struct dentry *lower_parent = ecryptfs_dentry_to_lower(dentry->d_parent);
struct inode *inode, *lower_inode;
- struct ecryptfs_dentry_info *dentry_info;
int rc = 0;
- dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- if (!dentry_info) {
- dput(lower_dentry);
- return ERR_PTR(-ENOMEM);
- }
-
fsstack_copy_attr_atime(d_inode(dentry->d_parent),
- d_inode(path->dentry));
+ d_inode(lower_parent));
BUG_ON(!d_count(lower_dentry));
- ecryptfs_set_dentry_private(dentry, dentry_info);
- dentry_info->lower_path.mnt = mntget(path->mnt);
- dentry_info->lower_path.dentry = lower_dentry;
+ ecryptfs_set_dentry_lower(dentry, lower_dentry);
/*
* negative dentry can go positive under us here - its parent is not
@@ -1021,10 +1012,10 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
{
struct dentry *dentry = path->dentry;
struct kstat lower_stat;
+ struct path lower_path = ecryptfs_lower_path(dentry);
int rc;
- rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry),
- &lower_stat, request_mask, flags);
+ rc = vfs_getattr_nosec(&lower_path, &lower_stat, request_mask, flags);
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index eab1beb846d3..16ea14dd2c62 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -106,15 +106,14 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- const struct path *path = ecryptfs_dentry_to_lower_path(dentry);
+ struct path path = ecryptfs_lower_path(dentry);
int rc;
- rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
- cred);
+ rc = ecryptfs_privileged_open(lower_file, path.dentry, path.mnt, cred);
if (rc) {
printk(KERN_ERR "Error opening lower file "
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
- "rc = [%d]\n", path->dentry, path->mnt, rc);
+ "rc = [%d]\n", path.dentry, path.mnt, rc);
(*lower_file) = NULL;
}
return rc;
@@ -437,7 +436,6 @@ static int ecryptfs_get_tree(struct fs_context *fc)
struct ecryptfs_fs_context *ctx = fc->fs_private;
struct ecryptfs_sb_info *sbi = fc->s_fs_info;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
- struct ecryptfs_dentry_info *root_info;
const char *err = "Getting sb failed";
struct inode *inode;
struct path path;
@@ -543,14 +541,8 @@ static int ecryptfs_get_tree(struct fs_context *fc)
goto out_free;
}
- rc = -ENOMEM;
- root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- if (!root_info)
- goto out_free;
-
- /* ->kill_sb() will take care of root_info */
- ecryptfs_set_dentry_private(s->s_root, root_info);
- root_info->lower_path = path;
+ ecryptfs_set_dentry_lower(s->s_root, path.dentry);
+ ecryptfs_superblock_to_private(s)->lower_mnt = path.mnt;
s->s_flags |= SB_ACTIVE;
fc->root = dget(s->s_root);
@@ -580,6 +572,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
kill_anon_super(sb);
if (!sb_info)
return;
+ mntput(sb_info->lower_mnt);
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
}
@@ -668,11 +661,6 @@ static struct ecryptfs_cache_info {
.size = sizeof(struct ecryptfs_file_info),
},
{
- .cache = &ecryptfs_dentry_info_cache,
- .name = "ecryptfs_dentry_info_cache",
- .size = sizeof(struct ecryptfs_dentry_info),
- },
- {
.cache = &ecryptfs_inode_info_cache,
.name = "ecryptfs_inode_cache",
.size = sizeof(struct ecryptfs_inode_info),
diff --git a/fs/internal.h b/fs/internal.h
index a33d18ee5b74..b5c62abefff4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -84,9 +84,9 @@ void mnt_put_write_access_file(struct file *file);
extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);
-int path_mount(const char *dev_name, struct path *path,
+int path_mount(const char *dev_name, const struct path *path,
const char *type_page, unsigned long flags, void *data_page);
-int path_umount(struct path *path, int flags);
+int path_umount(const struct path *path, int flags);
int show_path(struct seq_file *m, struct dentry *root);
diff --git a/fs/mount.h b/fs/mount.h
index 79c85639a7ba..f13a28752d0b 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,7 +58,10 @@ struct mount {
#endif
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
- struct list_head mnt_instance; /* mount instance on sb->s_mounts */
+ struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */
+ struct mount * __aligned(1) *mnt_pprev_for_sb;
+ /* except that LSB of pprev is stolen */
+#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
struct list_head mnt_expire; /* link in fs-specific expiry list */
@@ -148,6 +151,11 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
extern seqlock_t mount_lock;
+DEFINE_LOCK_GUARD_0(mount_writer, write_seqlock(&mount_lock),
+ write_sequnlock(&mount_lock))
+DEFINE_LOCK_GUARD_0(mount_locked_reader, read_seqlock_excl(&mount_lock),
+ read_sequnlock_excl(&mount_lock))
+
struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
@@ -224,4 +232,33 @@ static inline void mnt_notify_add(struct mount *m)
}
#endif
+static inline struct mount *topmost_overmount(struct mount *m)
+{
+ while (m->overmount)
+ m = m->overmount;
+ return m;
+}
+
+static inline bool __test_write_hold(struct mount * __aligned(1) *val)
+{
+ return (unsigned long)val & WRITE_HOLD;
+}
+
+static inline bool test_write_hold(const struct mount *m)
+{
+ return __test_write_hold(m->mnt_pprev_for_sb);
+}
+
+static inline void set_write_hold(struct mount *m)
+{
+ m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb
+ | WRITE_HOLD);
+}
+
+static inline void clear_write_hold(struct mount *m)
+{
+ m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb
+ & ~WRITE_HOLD);
+}
+
struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index dc01b14c58cd..d39499ab5cb5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -91,6 +91,14 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
+static inline void namespace_lock(void);
+static void namespace_unlock(void);
+DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock())
+DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem),
+ up_read(&namespace_sem))
+
+DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T))
+
#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
@@ -363,7 +371,7 @@ out_free_cache:
* mnt_want/drop_write() will _keep_ the filesystem
* r/w.
*/
-bool __mnt_is_readonly(struct vfsmount *mnt)
+bool __mnt_is_readonly(const struct vfsmount *mnt)
{
return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
@@ -403,7 +411,7 @@ static unsigned int mnt_get_writers(struct mount *mnt)
#endif
}
-static int mnt_is_readonly(struct vfsmount *mnt)
+static int mnt_is_readonly(const struct vfsmount *mnt)
{
if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
return 1;
@@ -444,31 +452,31 @@ int mnt_get_write_access(struct vfsmount *m)
mnt_inc_writers(mnt);
/*
* The store to mnt_inc_writers must be visible before we pass
- * MNT_WRITE_HOLD loop below, so that the slowpath can see our
- * incremented count after it has set MNT_WRITE_HOLD.
+ * WRITE_HOLD loop below, so that the slowpath can see our
+ * incremented count after it has set WRITE_HOLD.
*/
smp_mb();
might_lock(&mount_lock.lock);
- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+ while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) {
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
cpu_relax();
} else {
/*
* This prevents priority inversion, if the task
- * setting MNT_WRITE_HOLD got preempted on a remote
+ * setting WRITE_HOLD got preempted on a remote
* CPU, and it prevents life lock if the task setting
- * MNT_WRITE_HOLD has a lower priority and is bound to
+ * WRITE_HOLD has a lower priority and is bound to
* the same CPU as the task that is spinning here.
*/
preempt_enable();
- lock_mount_hash();
- unlock_mount_hash();
+ read_seqlock_excl(&mount_lock);
+ read_sequnlock_excl(&mount_lock);
preempt_disable();
}
}
/*
* The barrier pairs with the barrier sb_start_ro_state_change() making
- * sure that if we see MNT_WRITE_HOLD cleared, we will also see
+ * sure that if we see WRITE_HOLD cleared, we will also see
* s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
* mnt_is_readonly() and bail in case we are racing with remount
* read-only.
@@ -606,16 +614,16 @@ EXPORT_SYMBOL(mnt_drop_write_file);
* a call to mnt_unhold_writers() in order to stop preventing write access to
* @mnt.
*
- * Context: This function expects lock_mount_hash() to be held serializing
- * setting MNT_WRITE_HOLD.
+ * Context: This function expects to be in mount_locked_reader scope serializing
+ * setting WRITE_HOLD.
* Return: On success 0 is returned.
* On error, -EBUSY is returned.
*/
static inline int mnt_hold_writers(struct mount *mnt)
{
- mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
+ set_write_hold(mnt);
/*
- * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+ * After storing WRITE_HOLD, we'll read the counters. This store
* should be visible before we do.
*/
smp_mb();
@@ -631,9 +639,9 @@ static inline int mnt_hold_writers(struct mount *mnt)
* sum up each counter, if we read a counter before it is incremented,
* but then read another CPU's count which it has been subsequently
* decremented from -- we would see more decrements than we should.
- * MNT_WRITE_HOLD protects against this scenario, because
+ * WRITE_HOLD protects against this scenario, because
* mnt_want_write first increments count, then smp_mb, then spins on
- * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+ * WRITE_HOLD, so it can't be decremented by another CPU while
* we're counting up here.
*/
if (mnt_get_writers(mnt) > 0)
@@ -649,19 +657,42 @@ static inline int mnt_hold_writers(struct mount *mnt)
* Stop preventing write access to @mnt allowing callers to gain write access
* to @mnt again.
*
- * This function can only be called after a successful call to
- * mnt_hold_writers().
+ * This function can only be called after a call to mnt_hold_writers().
*
- * Context: This function expects lock_mount_hash() to be held.
+ * Context: This function expects to be in the same mount_locked_reader scope
+ * as the matching mnt_hold_writers().
*/
static inline void mnt_unhold_writers(struct mount *mnt)
{
+ if (!test_write_hold(mnt))
+ return;
/*
- * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+ * MNT_READONLY must become visible before ~WRITE_HOLD, so writers
* that become unheld will see MNT_READONLY.
*/
smp_wmb();
- mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+ clear_write_hold(mnt);
+}
+
+static inline void mnt_del_instance(struct mount *m)
+{
+ struct mount **p = m->mnt_pprev_for_sb;
+ struct mount *next = m->mnt_next_for_sb;
+
+ if (next)
+ next->mnt_pprev_for_sb = p;
+ *p = next;
+}
+
+static inline void mnt_add_instance(struct mount *m, struct super_block *s)
+{
+ struct mount *first = s->s_mounts;
+
+ if (first)
+ first->mnt_pprev_for_sb = &m->mnt_next_for_sb;
+ m->mnt_next_for_sb = first;
+ m->mnt_pprev_for_sb = &s->s_mounts;
+ s->s_mounts = m;
}
static int mnt_make_readonly(struct mount *mnt)
@@ -677,17 +708,17 @@ static int mnt_make_readonly(struct mount *mnt)
int sb_prepare_remount_readonly(struct super_block *sb)
{
- struct mount *mnt;
int err = 0;
- /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
+ /* Racy optimization. Recheck the counter under WRITE_HOLD */
if (atomic_long_read(&sb->s_remove_count))
return -EBUSY;
- lock_mount_hash();
- list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
- if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
- err = mnt_hold_writers(mnt);
+ guard(mount_locked_reader)();
+
+ for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
+ if (!(m->mnt.mnt_flags & MNT_READONLY)) {
+ err = mnt_hold_writers(m);
if (err)
break;
}
@@ -697,11 +728,10 @@ int sb_prepare_remount_readonly(struct super_block *sb)
if (!err)
sb_start_ro_state_change(sb);
- list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
- if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
- mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+ for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
+ if (test_write_hold(m))
+ clear_write_hold(m);
}
- unlock_mount_hash();
return err;
}
@@ -760,24 +790,16 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
}
/**
- * __lookup_mnt - find first child mount
+ * __lookup_mnt - mount hash lookup
* @mnt: parent mount
- * @dentry: mountpoint
- *
- * If @mnt has a child mount @c mounted @dentry find and return it.
+ * @dentry: dentry of mountpoint
*
- * Note that the child mount @c need not be unique. There are cases
- * where shadow mounts are created. For example, during mount
- * propagation when a source mount @mnt whose root got overmounted by a
- * mount @o after path lookup but before @namespace_sem could be
- * acquired gets copied and propagated. So @mnt gets copied including
- * @o. When @mnt is propagated to a destination mount @d that already
- * has another mount @n mounted at the same mountpoint then the source
- * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
- * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
- * on @dentry.
+ * If @mnt has a child mount @c mounted on @dentry find and return it.
+ * Caller must either hold the spinlock component of @mount_lock or
+ * hold rcu_read_lock(), sample the seqcount component before the call
+ * and recheck it afterwards.
*
- * Return: The first child of @mnt mounted @dentry or NULL.
+ * Return: The child of @mnt mounted on @dentry or %NULL.
*/
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
@@ -790,21 +812,12 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
return NULL;
}
-/*
- * lookup_mnt - Return the first child mount mounted at path
- *
- * "First" means first mounted chronologically. If you create the
- * following mounts:
- *
- * mount /dev/sda1 /mnt
- * mount /dev/sda2 /mnt
- * mount /dev/sda3 /mnt
- *
- * Then lookup_mnt() on the base /mnt dentry in the root mount will
- * return successively the root dentry and vfsmount of /dev/sda1, then
- * /dev/sda2, then /dev/sda3, then NULL.
+/**
+ * lookup_mnt - Return the child mount mounted at given location
+ * @path: location in the namespace
*
- * lookup_mnt takes a reference to the found vfsmount.
+ * Acquires and returns a new reference to mount at given location
+ * or %NULL if nothing is mounted there.
*/
struct vfsmount *lookup_mnt(const struct path *path)
{
@@ -841,22 +854,20 @@ bool __is_local_mountpoint(const struct dentry *dentry)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct mount *mnt, *n;
- bool is_covered = false;
- down_read(&namespace_sem);
- rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
- is_covered = (mnt->mnt_mountpoint == dentry);
- if (is_covered)
- break;
- }
- up_read(&namespace_sem);
+ guard(namespace_shared)();
- return is_covered;
+ rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node)
+ if (mnt->mnt_mountpoint == dentry)
+ return true;
+
+ return false;
}
struct pinned_mountpoint {
struct hlist_node node;
struct mountpoint *mp;
+ struct mount *parent;
};
static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
@@ -947,7 +958,7 @@ static void unpin_mountpoint(struct pinned_mountpoint *m)
}
}
-static inline int check_mnt(struct mount *mnt)
+static inline int check_mnt(const struct mount *mnt)
{
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
@@ -1149,6 +1160,20 @@ static void commit_tree(struct mount *mnt)
touch_mnt_namespace(n);
}
+static void setup_mnt(struct mount *m, struct dentry *root)
+{
+ struct super_block *s = root->d_sb;
+
+ atomic_inc(&s->s_active);
+ m->mnt.mnt_sb = s;
+ m->mnt.mnt_root = dget(root);
+ m->mnt_mountpoint = m->mnt.mnt_root;
+ m->mnt_parent = m;
+
+ guard(mount_locked_reader)();
+ mnt_add_instance(m, s);
+}
+
/**
* vfs_create_mount - Create a mount for a configured superblock
* @fc: The configuration context with the superblock attached
@@ -1172,15 +1197,8 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
if (fc->sb_flags & SB_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
- atomic_inc(&fc->root->d_sb->s_active);
- mnt->mnt.mnt_sb = fc->root->d_sb;
- mnt->mnt.mnt_root = dget(fc->root);
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
+ setup_mnt(mnt, fc->root);
- lock_mount_hash();
- list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
- unlock_mount_hash();
return &mnt->mnt;
}
EXPORT_SYMBOL(vfs_create_mount);
@@ -1238,7 +1256,6 @@ EXPORT_SYMBOL_GPL(vfs_kern_mount);
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
int flag)
{
- struct super_block *sb = old->mnt.mnt_sb;
struct mount *mnt;
int err;
@@ -1263,16 +1280,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if (mnt->mnt_group_id)
set_mnt_shared(mnt);
- atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
- mnt->mnt.mnt_sb = sb;
- mnt->mnt.mnt_root = dget(root);
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
- lock_mount_hash();
- list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
- unlock_mount_hash();
+ setup_mnt(mnt, root);
if (flag & CL_PRIVATE) // we are done with it
return mnt;
@@ -1378,7 +1388,7 @@ static void mntput_no_expire(struct mount *mnt)
mnt->mnt.mnt_flags |= MNT_DOOMED;
rcu_read_unlock();
- list_del(&mnt->mnt_instance);
+ mnt_del_instance(mnt);
if (unlikely(!list_empty(&mnt->mnt_expire)))
list_del(&mnt->mnt_expire);
@@ -1719,8 +1729,6 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}
-DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())
-
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
@@ -1785,6 +1793,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
if (how & UMOUNT_PROPAGATE)
propagate_umount(&tmp_list);
+ bulk_make_private(&tmp_list);
+
while (!list_empty(&tmp_list)) {
struct mnt_namespace *ns;
bool disconnect;
@@ -1809,7 +1819,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
umount_mnt(p);
}
}
- change_mnt_propagation(p, MS_PRIVATE);
if (disconnect)
hlist_add_head(&p->mnt_umount, &unmounted);
@@ -1969,10 +1978,11 @@ void __detach_mounts(struct dentry *dentry)
struct pinned_mountpoint mp = {};
struct mount *mnt;
- namespace_lock();
- lock_mount_hash();
+ guard(namespace_excl)();
+ guard(mount_writer)();
+
if (!lookup_mountpoint(dentry, &mp))
- goto out_unlock;
+ return;
event++;
while (mp.node.next) {
@@ -1984,9 +1994,6 @@ void __detach_mounts(struct dentry *dentry)
else umount_tree(mnt, UMOUNT_CONNECTED);
}
unpin_mountpoint(&mp);
-out_unlock:
- unlock_mount_hash();
- namespace_unlock();
}
/*
@@ -2025,7 +2032,7 @@ static int can_umount(const struct path *path, int flags)
}
// caller is responsible for flags being sane
-int path_umount(struct path *path, int flags)
+int path_umount(const struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
int ret;
@@ -2238,7 +2245,7 @@ static inline bool extend_array(struct path **res, struct path **to_free,
return p;
}
-struct path *collect_paths(const struct path *path,
+const struct path *collect_paths(const struct path *path,
struct path *prealloc, unsigned count)
{
struct mount *root = real_mount(path->mnt);
@@ -2246,7 +2253,7 @@ struct path *collect_paths(const struct path *path,
struct path *res = prealloc, *to_free = NULL;
unsigned n = 0;
- guard(rwsem_read)(&namespace_sem);
+ guard(namespace_shared)();
if (!check_mnt(root))
return ERR_PTR(-EINVAL);
@@ -2272,9 +2279,9 @@ struct path *collect_paths(const struct path *path,
return res;
}
-void drop_collected_paths(struct path *paths, struct path *prealloc)
+void drop_collected_paths(const struct path *paths, const struct path *prealloc)
{
- for (struct path *p = paths; p->mnt; p++)
+ for (const struct path *p = paths; p->mnt; p++)
path_put(p);
if (paths != prealloc)
kfree(paths);
@@ -2301,7 +2308,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
return;
}
- scoped_guard(namespace_lock, &namespace_sem) {
+ scoped_guard(namespace_excl) {
if (!anon_ns_root(m))
return;
@@ -2312,6 +2319,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
}
}
+/* locks: namespace_shared && pinned(mnt) || mount_locked_reader */
static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
{
struct mount *child;
@@ -2328,12 +2336,8 @@ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
- bool res;
-
- read_seqlock_excl(&mount_lock);
- res = __has_locked_children(mnt, dentry);
- read_sequnlock_excl(&mount_lock);
- return res;
+ guard(mount_locked_reader)();
+ return __has_locked_children(mnt, dentry);
}
/*
@@ -2341,21 +2345,15 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
* specified subtree. Such references can act as pins for mount namespaces
* that aren't checked by the mount-cycle checking code, thereby allowing
* cycles to be made.
+ *
+ * locks: mount_locked_reader || namespace_shared && pinned(subtree)
*/
static bool check_for_nsfs_mounts(struct mount *subtree)
{
- struct mount *p;
- bool ret = false;
-
- lock_mount_hash();
- for (p = subtree; p; p = next_mnt(p, subtree))
+ for (struct mount *p = subtree; p; p = next_mnt(p, subtree))
if (mnt_ns_loop(p->mnt.mnt_root))
- goto out;
-
- ret = true;
-out:
- unlock_mount_hash();
- return ret;
+ return false;
+ return true;
}
/**
@@ -2375,7 +2373,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
- guard(rwsem_read)(&namespace_sem);
+ guard(namespace_shared)();
if (IS_MNT_UNBINDABLE(old_mnt))
return ERR_PTR(-EINVAL);
@@ -2496,8 +2494,7 @@ enum mnt_tree_flags_t {
/**
* attach_recursive_mnt - attach a source mount tree
* @source_mnt: mount tree to be attached
- * @dest_mnt: mount that @source_mnt will be mounted on
- * @dest_mp: the mountpoint @source_mnt will be mounted at
+ * @dest: the context for mounting at the place where the tree should go
*
* NOTE: in the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
@@ -2560,10 +2557,11 @@ enum mnt_tree_flags_t {
* Otherwise a negative error code is returned.
*/
static int attach_recursive_mnt(struct mount *source_mnt,
- struct mount *dest_mnt,
- struct mountpoint *dest_mp)
+ const struct pinned_mountpoint *dest)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+ struct mount *dest_mnt = dest->parent;
+ struct mountpoint *dest_mp = dest->mp;
HLIST_HEAD(tree_list);
struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct pinned_mountpoint root = {};
@@ -2643,10 +2641,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
child->mnt_mountpoint);
commit_tree(child);
if (q) {
+ struct mount *r = topmost_overmount(child);
struct mountpoint *mp = root.mp;
- struct mount *r = child;
- while (unlikely(r->overmount))
- r = r->overmount;
+
if (unlikely(shorter) && child != source_mnt)
mp = shorter;
mnt_change_mountpoint(r, mp, q);
@@ -2675,110 +2672,120 @@ static int attach_recursive_mnt(struct mount *source_mnt,
return err;
}
+static inline struct mount *where_to_mount(const struct path *path,
+ struct dentry **dentry,
+ bool beneath)
+{
+ struct mount *m;
+
+ if (unlikely(beneath)) {
+ m = topmost_overmount(real_mount(path->mnt));
+ *dentry = m->mnt_mountpoint;
+ return m->mnt_parent;
+ }
+ m = __lookup_mnt(path->mnt, path->dentry);
+ if (unlikely(m)) {
+ m = topmost_overmount(m);
+ *dentry = m->mnt.mnt_root;
+ return m;
+ }
+ *dentry = path->dentry;
+ return real_mount(path->mnt);
+}
+
/**
- * do_lock_mount - lock mount and mountpoint
- * @path: target path
- * @beneath: whether the intention is to mount beneath @path
- *
- * Follow the mount stack on @path until the top mount @mnt is found. If
- * the initial @path->{mnt,dentry} is a mountpoint lookup the first
- * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
- * until nothing is stacked on top of it anymore.
+ * do_lock_mount - acquire environment for mounting
+ * @path: target path
+ * @res: context to set up
+ * @beneath: whether the intention is to mount beneath @path
*
- * Acquire the inode_lock() on the top mount's ->mnt_root to protect
- * against concurrent removal of the new mountpoint from another mount
- * namespace.
+ * To mount something at a given location, we need:
+ * - namespace_sem locked exclusive
+ * - the inode of the dentry we are mounting on locked exclusive
+ * - a struct mountpoint for that dentry
+ * - the struct mount we are mounting on
*
- * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint
- * @mp on @mnt->mnt_parent must be acquired. This protects against a
- * concurrent unlink of @mp->mnt_dentry from another mount namespace
- * where @mnt doesn't have a child mount mounted @mp. A concurrent
- * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
- * on top of it for @beneath.
+ * Results are stored in caller-supplied context (pinned_mountpoint);
+ * on success we have res->parent and res->mp pointing to parent and
+ * mountpoint respectively and res->node inserted into the ->m_list
+ * of the mountpoint, making sure the mountpoint won't disappear.
+ * On failure we have res->parent set to ERR_PTR(-E...), res->mp
+ * left NULL and res->node left empty.
+ * In case of success do_lock_mount returns with locks acquired (in
+ * proper order - inode lock nests outside of namespace_sem).
*
- * In addition, @beneath needs to make sure that @mnt hasn't been
- * unmounted or moved from its current mountpoint in between dropping
- * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
- * being unmounted would be detected later by e.g., calling
- * check_mnt(mnt) in the function it's called from. For the @beneath
- * case however, it's useful to detect it directly in do_lock_mount().
- * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
- * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
- * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
+ * A request to mount on an overmounted location is treated as "mount on
+ * top of whatever's overmounting it"; a request to mount beneath
+ * a location is treated as "mount immediately beneath the topmost mount
+ * at that place".
*
- * Return: Either the target mountpoint on the top mount or the top
- * mount's mountpoint.
+ * In all cases the location must not have been unmounted and the
+ * chosen mountpoint must be allowed to be mounted on. For the "beneath"
+ * case we also require the location to be at the root of a mount
+ * that has a parent (i.e. is not the root of some namespace).
*/
-static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath)
+static void do_lock_mount(const struct path *path,
+ struct pinned_mountpoint *res,
+ bool beneath)
{
- struct vfsmount *mnt = path->mnt;
- struct dentry *dentry;
- struct path under = {};
- int err = -ENOENT;
+ int err;
- for (;;) {
- struct mount *m = real_mount(mnt);
+ if (unlikely(beneath) && !path_mounted(path)) {
+ res->parent = ERR_PTR(-EINVAL);
+ return;
+ }
- if (beneath) {
- path_put(&under);
- read_seqlock_excl(&mount_lock);
- under.mnt = mntget(&m->mnt_parent->mnt);
- under.dentry = dget(m->mnt_mountpoint);
- read_sequnlock_excl(&mount_lock);
- dentry = under.dentry;
- } else {
- dentry = path->dentry;
+ do {
+ struct dentry *dentry, *d;
+ struct mount *m, *n;
+
+ scoped_guard(mount_locked_reader) {
+ m = where_to_mount(path, &dentry, beneath);
+ if (&m->mnt != path->mnt) {
+ mntget(&m->mnt);
+ dget(dentry);
+ }
}
inode_lock(dentry->d_inode);
namespace_lock();
- if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
- break; // not to be mounted on
+ // check if the chain of mounts (if any) has changed.
+ scoped_guard(mount_locked_reader)
+ n = where_to_mount(path, &d, beneath);
- if (beneath && unlikely(m->mnt_mountpoint != dentry ||
- &m->mnt_parent->mnt != under.mnt)) {
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- continue; // got moved
- }
+ if (unlikely(n != m || dentry != d))
+ err = -EAGAIN; // something moved, retry
+ else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt)))
+ err = -ENOENT; // not to be mounted on
+ else if (beneath && &m->mnt == path->mnt && !m->overmount)
+ err = -EINVAL;
+ else
+ err = get_mountpoint(dentry, res);
- mnt = lookup_mnt(path);
- if (unlikely(mnt)) {
+ if (unlikely(err)) {
+ res->parent = ERR_PTR(err);
namespace_unlock();
inode_unlock(dentry->d_inode);
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- continue; // got overmounted
+ } else {
+ res->parent = m;
}
- err = get_mountpoint(dentry, pinned);
- if (err)
- break;
- if (beneath) {
- /*
- * @under duplicates the references that will stay
- * at least until namespace_unlock(), so the path_put()
- * below is safe (and OK to do under namespace_lock -
- * we are not dropping the final references here).
- */
- path_put(&under);
+ /*
+ * Drop the temporary references. This is subtle - on success
+ * we are doing that under namespace_sem, which would normally
+ * be forbidden. However, in that case we are guaranteed that
+ * refcounts won't reach zero, since we know that path->mnt
+ * is mounted and thus all mounts reachable from it are pinned
+ * and stable, along with their mountpoints and roots.
+ */
+ if (&m->mnt != path->mnt) {
+ dput(dentry);
+ mntput(&m->mnt);
}
- return 0;
- }
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- if (beneath)
- path_put(&under);
- return err;
-}
-
-static inline int lock_mount(struct path *path, struct pinned_mountpoint *m)
-{
- return do_lock_mount(path, m, false);
+ } while (err == -EAGAIN);
}
-static void unlock_mount(struct pinned_mountpoint *m)
+static void __unlock_mount(struct pinned_mountpoint *m)
{
inode_unlock(m->mp->m_dentry->d_inode);
read_seqlock_excl(&mount_lock);
@@ -2787,16 +2794,30 @@ static void unlock_mount(struct pinned_mountpoint *m)
namespace_unlock();
}
-static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
+static inline void unlock_mount(struct pinned_mountpoint *m)
+{
+ if (!IS_ERR(m->parent))
+ __unlock_mount(m);
+}
+
+#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
+ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
+ do_lock_mount((path), &mp, (beneath))
+#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
+#define LOCK_MOUNT_EXACT(mp, path) \
+ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
+ lock_mount_exact((path), &mp)
+
+static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
{
if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
return -EINVAL;
- if (d_is_dir(mp->m_dentry) !=
+ if (d_is_dir(mp->mp->m_dentry) !=
d_is_dir(mnt->mnt.mnt_root))
return -ENOTDIR;
- return attach_recursive_mnt(mnt, p, mp);
+ return attach_recursive_mnt(mnt, mp);
}
static int may_change_propagation(const struct mount *m)
@@ -2832,13 +2853,13 @@ static int flags_to_propagation_type(int ms_flags)
/*
* recursively change the type of the mountpoint.
*/
-static int do_change_type(struct path *path, int ms_flags)
+static int do_change_type(const struct path *path, int ms_flags)
{
struct mount *m;
struct mount *mnt = real_mount(path->mnt);
int recurse = ms_flags & MS_REC;
int type;
- int err = 0;
+ int err;
if (!path_mounted(path))
return -EINVAL;
@@ -2847,23 +2868,22 @@ static int do_change_type(struct path *path, int ms_flags)
if (!type)
return -EINVAL;
- namespace_lock();
+ guard(namespace_excl)();
+
err = may_change_propagation(mnt);
if (err)
- goto out_unlock;
+ return err;
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
- goto out_unlock;
+ return err;
}
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- out_unlock:
- namespace_unlock();
- return err;
+ return 0;
}
/* may_copy_tree() - check if a mount tree can be copied
@@ -2909,7 +2929,7 @@ static int do_change_type(struct path *path, int ms_flags)
*
* Returns true if the mount tree can be copied, false otherwise.
*/
-static inline bool may_copy_tree(struct path *path)
+static inline bool may_copy_tree(const struct path *path)
{
struct mount *mnt = real_mount(path->mnt);
const struct dentry_operations *d_op;
@@ -2931,7 +2951,7 @@ static inline bool may_copy_tree(struct path *path)
}
-static struct mount *__do_loopback(struct path *old_path, int recurse)
+static struct mount *__do_loopback(const struct path *old_path, int recurse)
{
struct mount *old = real_mount(old_path->mnt);
@@ -2953,12 +2973,11 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
/*
* do loopback mount.
*/
-static int do_loopback(struct path *path, const char *old_name,
- int recurse)
+static int do_loopback(const struct path *path, const char *old_name,
+ int recurse)
{
- struct path old_path;
- struct mount *mnt = NULL, *parent;
- struct pinned_mountpoint mp = {};
+ struct path old_path __free(path_put) = {};
+ struct mount *mnt = NULL;
int err;
if (!old_name || !*old_name)
return -EINVAL;
@@ -2966,49 +2985,40 @@ static int do_loopback(struct path *path, const char *old_name,
if (err)
return err;
- err = -EINVAL;
if (mnt_ns_loop(old_path.dentry))
- goto out;
+ return -EINVAL;
- err = lock_mount(path, &mp);
- if (err)
- goto out;
+ LOCK_MOUNT(mp, path);
+ if (IS_ERR(mp.parent))
+ return PTR_ERR(mp.parent);
- parent = real_mount(path->mnt);
- if (!check_mnt(parent))
- goto out2;
+ if (!check_mnt(mp.parent))
+ return -EINVAL;
mnt = __do_loopback(&old_path, recurse);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
- goto out2;
- }
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
- err = graft_tree(mnt, parent, mp.mp);
+ err = graft_tree(mnt, &mp);
if (err) {
lock_mount_hash();
umount_tree(mnt, UMOUNT_SYNC);
unlock_mount_hash();
}
-out2:
- unlock_mount(&mp);
-out:
- path_put(&old_path);
return err;
}
-static struct file *open_detached_copy(struct path *path, bool recursive)
+static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
{
struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
struct user_namespace *user_ns = mnt_ns->user_ns;
struct mount *mnt, *p;
- struct file *file;
ns = alloc_mnt_ns(user_ns, true);
if (IS_ERR(ns))
- return ERR_CAST(ns);
+ return ns;
- namespace_lock();
+ guard(namespace_excl)();
/*
* Record the sequence number of the source mount namespace.
@@ -3025,23 +3035,28 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
mnt = __do_loopback(path, recursive);
if (IS_ERR(mnt)) {
- namespace_unlock();
- free_mnt_ns(ns);
+ emptied_ns = ns;
return ERR_CAST(mnt);
}
- lock_mount_hash();
for (p = mnt; p; p = next_mnt(p, mnt)) {
mnt_add_to_ns(ns, p);
ns->nr_mounts++;
}
ns->root = mnt;
- mntget(&mnt->mnt);
- unlock_mount_hash();
- namespace_unlock();
+ return ns;
+}
+
+static struct file *open_detached_copy(struct path *path, bool recursive)
+{
+ struct mnt_namespace *ns = get_detached_copy(path, recursive);
+ struct file *file;
+
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
mntput(path->mnt);
- path->mnt = &mnt->mnt;
+ path->mnt = mntget(&ns->root->mnt);
file = dentry_open(path, O_PATH, current_cred());
if (IS_ERR(file))
dissolve_on_fput(path->mnt);
@@ -3158,7 +3173,8 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
touch_mnt_namespace(mnt->mnt_ns);
}
-static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
+static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
+ struct vfsmount *mnt)
{
struct super_block *sb = mnt->mnt_sb;
@@ -3192,7 +3208,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
* superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
* to mount(2).
*/
-static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
+static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags)
{
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
@@ -3229,7 +3245,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
* If you've mounted a non-root directory somewhere and want to do remount
* on it - tough luck.
*/
-static int do_remount(struct path *path, int sb_flags,
+static int do_remount(const struct path *path, int sb_flags,
int mnt_flags, void *data)
{
int err;
@@ -3287,49 +3303,46 @@ static inline int tree_contains_unbindable(struct mount *mnt)
return 0;
}
-static int do_set_group(struct path *from_path, struct path *to_path)
+static int do_set_group(const struct path *from_path, const struct path *to_path)
{
- struct mount *from, *to;
+ struct mount *from = real_mount(from_path->mnt);
+ struct mount *to = real_mount(to_path->mnt);
int err;
- from = real_mount(from_path->mnt);
- to = real_mount(to_path->mnt);
-
- namespace_lock();
+ guard(namespace_excl)();
err = may_change_propagation(from);
if (err)
- goto out;
+ return err;
err = may_change_propagation(to);
if (err)
- goto out;
+ return err;
- err = -EINVAL;
/* To and From paths should be mount roots */
if (!path_mounted(from_path))
- goto out;
+ return -EINVAL;
if (!path_mounted(to_path))
- goto out;
+ return -EINVAL;
/* Setting sharing groups is only allowed across same superblock */
if (from->mnt.mnt_sb != to->mnt.mnt_sb)
- goto out;
+ return -EINVAL;
/* From mount root should be wider than To mount root */
if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
- goto out;
+ return -EINVAL;
/* From mount should not have locked children in place of To's root */
if (__has_locked_children(from, to->mnt.mnt_root))
- goto out;
+ return -EINVAL;
/* Setting sharing groups is only allowed on private mounts */
if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
- goto out;
+ return -EINVAL;
/* From should not be private */
if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
- goto out;
+ return -EINVAL;
if (IS_MNT_SLAVE(from)) {
hlist_add_behind(&to->mnt_slave, &from->mnt_slave);
@@ -3341,11 +3354,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
list_add(&to->mnt_share, &from->mnt_share);
set_mnt_shared(to);
}
-
- err = 0;
-out:
- namespace_unlock();
- return err;
+ return 0;
}
/**
@@ -3389,17 +3398,15 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
/**
* can_move_mount_beneath - check that we can mount beneath the top mount
- * @from: mount to mount beneath
- * @to: mount under which to mount
- * @mp: mountpoint of @to
+ * @mnt_from: mount we are trying to move
+ * @mnt_to: mount under which to mount
+ * @mp: mountpoint of @mnt_to
*
- * - Make sure that @to->dentry is actually the root of a mount under
- * which we can mount another mount.
* - Make sure that nothing can be mounted beneath the caller's current
* root or the rootfs of the namespace.
* - Make sure that the caller can unmount the topmost mount ensuring
* that the caller could reveal the underlying mountpoint.
- * - Ensure that nothing has been mounted on top of @from before we
+ * - Ensure that nothing has been mounted on top of @mnt_from before we
* grabbed @namespace_sem to avoid creating pointless shadow mounts.
* - Prevent mounting beneath a mount if the propagation relationship
* between the source mount, parent mount, and top mount would lead to
@@ -3408,25 +3415,17 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
* Context: This function expects namespace_lock() to be held.
* Return: On success 0, and on error a negative error code is returned.
*/
-static int can_move_mount_beneath(const struct path *from,
- const struct path *to,
+static int can_move_mount_beneath(const struct mount *mnt_from,
+ const struct mount *mnt_to,
const struct mountpoint *mp)
{
- struct mount *mnt_from = real_mount(from->mnt),
- *mnt_to = real_mount(to->mnt),
- *parent_mnt_to = mnt_to->mnt_parent;
-
- if (!mnt_has_parent(mnt_to))
- return -EINVAL;
-
- if (!path_mounted(to))
- return -EINVAL;
+ struct mount *parent_mnt_to = mnt_to->mnt_parent;
if (IS_MNT_LOCKED(mnt_to))
return -EINVAL;
/* Avoid creating shadow mounts during mount propagation. */
- if (path_overmounted(from))
+ if (mnt_from->overmount)
return -EINVAL;
/*
@@ -3517,97 +3516,83 @@ static inline bool may_use_mount(struct mount *mnt)
return check_anonymous_mnt(mnt);
}
-static int do_move_mount(struct path *old_path,
- struct path *new_path, enum mnt_tree_flags_t flags)
+static int do_move_mount(const struct path *old_path,
+ const struct path *new_path,
+ enum mnt_tree_flags_t flags)
{
- struct mnt_namespace *ns;
- struct mount *p;
- struct mount *old;
- struct mount *parent;
- struct pinned_mountpoint mp;
+ struct mount *old = real_mount(old_path->mnt);
int err;
bool beneath = flags & MNT_TREE_BENEATH;
- err = do_lock_mount(new_path, &mp, beneath);
- if (err)
- return err;
+ if (!path_mounted(old_path))
+ return -EINVAL;
- old = real_mount(old_path->mnt);
- p = real_mount(new_path->mnt);
- parent = old->mnt_parent;
- ns = old->mnt_ns;
+ if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry))
+ return -EINVAL;
- err = -EINVAL;
+ LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath);
+ if (IS_ERR(mp.parent))
+ return PTR_ERR(mp.parent);
if (check_mnt(old)) {
/* if the source is in our namespace... */
/* ... it should be detachable from parent */
if (!mnt_has_parent(old) || IS_MNT_LOCKED(old))
- goto out;
+ return -EINVAL;
+ /* ... which should not be shared */
+ if (IS_MNT_SHARED(old->mnt_parent))
+ return -EINVAL;
/* ... and the target should be in our namespace */
- if (!check_mnt(p))
- goto out;
- /* parent of the source should not be shared */
- if (IS_MNT_SHARED(parent))
- goto out;
+ if (!check_mnt(mp.parent))
+ return -EINVAL;
} else {
/*
* otherwise the source must be the root of some anon namespace.
*/
if (!anon_ns_root(old))
- goto out;
+ return -EINVAL;
/*
* Bail out early if the target is within the same namespace -
* subsequent checks would've rejected that, but they lose
* some corner cases if we check it early.
*/
- if (ns == p->mnt_ns)
- goto out;
+ if (old->mnt_ns == mp.parent->mnt_ns)
+ return -EINVAL;
/*
* Target should be either in our namespace or in an acceptable
* anon namespace, sensu check_anonymous_mnt().
*/
- if (!may_use_mount(p))
- goto out;
+ if (!may_use_mount(mp.parent))
+ return -EINVAL;
}
- if (!path_mounted(old_path))
- goto out;
-
- if (d_is_dir(new_path->dentry) !=
- d_is_dir(old_path->dentry))
- goto out;
-
if (beneath) {
- err = can_move_mount_beneath(old_path, new_path, mp.mp);
- if (err)
- goto out;
+ struct mount *over = real_mount(new_path->mnt);
- err = -EINVAL;
- p = p->mnt_parent;
+ if (mp.parent != over->mnt_parent)
+ over = mp.parent->overmount;
+ err = can_move_mount_beneath(old, over, mp.mp);
+ if (err)
+ return err;
}
/*
* Don't move a mount tree containing unbindable mounts to a destination
* mount which is shared.
*/
- if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
- goto out;
- err = -ELOOP;
+ if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old))
+ return -EINVAL;
if (!check_for_nsfs_mounts(old))
- goto out;
- if (mount_is_ancestor(old, p))
- goto out;
+ return -ELOOP;
+ if (mount_is_ancestor(old, mp.parent))
+ return -ELOOP;
- err = attach_recursive_mnt(old, p, mp.mp);
-out:
- unlock_mount(&mp);
- return err;
+ return attach_recursive_mnt(old, &mp);
}
-static int do_move_mount_old(struct path *path, const char *old_name)
+static int do_move_mount_old(const struct path *path, const char *old_name)
{
- struct path old_path;
+ struct path old_path __free(path_put) = {};
int err;
if (!old_name || !*old_name)
@@ -3617,18 +3602,19 @@ static int do_move_mount_old(struct path *path, const char *old_name)
if (err)
return err;
- err = do_move_mount(&old_path, path, 0);
- path_put(&old_path);
- return err;
+ return do_move_mount(&old_path, path, 0);
}
/*
* add a mount into a namespace's mount tree
*/
-static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
- const struct path *path, int mnt_flags)
+static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp,
+ int mnt_flags)
{
- struct mount *parent = real_mount(path->mnt);
+ struct mount *parent = mp->parent;
+
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
mnt_flags &= ~MNT_INTERNAL_FLAGS;
@@ -3642,14 +3628,15 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
}
/* Refuse the same filesystem on the same mount point */
- if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
+ if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb &&
+ parent->mnt.mnt_root == mp->mp->m_dentry)
return -EBUSY;
if (d_is_symlink(newmnt->mnt.mnt_root))
return -EINVAL;
newmnt->mnt.mnt_flags = mnt_flags;
- return graft_tree(newmnt, parent, mp);
+ return graft_tree(newmnt, mp);
}
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
@@ -3658,41 +3645,32 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
* Create a new mount using a superblock configuration and request it
* be added to the namespace tree.
*/
-static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
+static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint,
unsigned int mnt_flags)
{
- struct vfsmount *mnt;
- struct pinned_mountpoint mp = {};
- struct super_block *sb = fc->root->d_sb;
+ struct super_block *sb;
+ struct vfsmount *mnt __free(mntput) = fc_mount(fc);
int error;
- error = security_sb_kern_mount(sb);
- if (!error && mount_too_revealing(sb, &mnt_flags)) {
- errorfcp(fc, "VFS", "Mount too revealing");
- error = -EPERM;
- }
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
- if (unlikely(error)) {
- fc_drop_locked(fc);
+ sb = fc->root->d_sb;
+ error = security_sb_kern_mount(sb);
+ if (unlikely(error))
return error;
- }
- up_write(&sb->s_umount);
-
- mnt = vfs_create_mount(fc);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
+ if (unlikely(mount_too_revealing(sb, &mnt_flags))) {
+ errorfcp(fc, "VFS", "Mount too revealing");
+ return -EPERM;
+ }
mnt_warn_timestamp_expiry(mountpoint, mnt);
- error = lock_mount(mountpoint, &mp);
- if (!error) {
- error = do_add_mount(real_mount(mnt), mp.mp,
- mountpoint, mnt_flags);
- unlock_mount(&mp);
- }
- if (error < 0)
- mntput(mnt);
+ LOCK_MOUNT(mp, mountpoint);
+ error = do_add_mount(real_mount(mnt), &mp, mnt_flags);
+ if (!error)
+ retain_and_null_ptr(mnt); // consumed on success
return error;
}
@@ -3700,8 +3678,9 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
* create a new mount for userspace and request it to be added into the
* namespace's tree
*/
-static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
- int mnt_flags, const char *name, void *data)
+static int do_new_mount(const struct path *path, const char *fstype,
+ int sb_flags, int mnt_flags,
+ const char *name, void *data)
{
struct file_system_type *type;
struct fs_context *fc;
@@ -3747,18 +3726,38 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
if (!err && !mount_capable(fc))
err = -EPERM;
if (!err)
- err = vfs_get_tree(fc);
- if (!err)
err = do_new_mount_fc(fc, path, mnt_flags);
put_fs_context(fc);
return err;
}
-int finish_automount(struct vfsmount *m, const struct path *path)
+static void lock_mount_exact(const struct path *path,
+ struct pinned_mountpoint *mp)
{
struct dentry *dentry = path->dentry;
- struct pinned_mountpoint mp = {};
+ int err;
+
+ inode_lock(dentry->d_inode);
+ namespace_lock();
+ if (unlikely(cant_mount(dentry)))
+ err = -ENOENT;
+ else if (path_overmounted(path))
+ err = -EBUSY;
+ else
+ err = get_mountpoint(dentry, mp);
+ if (unlikely(err)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ mp->parent = ERR_PTR(err);
+ } else {
+ mp->parent = real_mount(path->mnt);
+ }
+}
+
+int finish_automount(struct vfsmount *__m, const struct path *path)
+{
+ struct vfsmount *m __free(mntput) = __m;
struct mount *mnt;
int err;
@@ -3769,43 +3768,21 @@ int finish_automount(struct vfsmount *m, const struct path *path)
mnt = real_mount(m);
- if (m->mnt_sb == path->mnt->mnt_sb &&
- m->mnt_root == dentry) {
- err = -ELOOP;
- goto discard;
- }
+ if (m->mnt_root == path->dentry)
+ return -ELOOP;
/*
- * we don't want to use lock_mount() - in this case finding something
+ * we don't want to use LOCK_MOUNT() - in this case finding something
* that overmounts our mountpoint to be means "quitely drop what we've
* got", not "try to mount it on top".
*/
- inode_lock(dentry->d_inode);
- namespace_lock();
- if (unlikely(cant_mount(dentry))) {
- err = -ENOENT;
- goto discard_locked;
- }
- if (path_overmounted(path)) {
- err = 0;
- goto discard_locked;
- }
- err = get_mountpoint(dentry, &mp);
- if (err)
- goto discard_locked;
-
- err = do_add_mount(mnt, mp.mp, path,
- path->mnt->mnt_flags | MNT_SHRINKABLE);
- unlock_mount(&mp);
- if (unlikely(err))
- goto discard;
- return 0;
+ LOCK_MOUNT_EXACT(mp, path);
+ if (mp.parent == ERR_PTR(-EBUSY))
+ return 0;
-discard_locked:
- namespace_unlock();
- inode_unlock(dentry->d_inode);
-discard:
- mntput(m);
+ err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE);
+ if (likely(!err))
+ retain_and_null_ptr(m);
return err;
}
@@ -3816,9 +3793,8 @@ discard:
*/
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
- read_seqlock_excl(&mount_lock);
+ guard(mount_locked_reader)();
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
- read_sequnlock_excl(&mount_lock);
}
EXPORT_SYMBOL(mnt_set_expiry);
@@ -3835,8 +3811,8 @@ void mark_mounts_for_expiry(struct list_head *mounts)
if (list_empty(mounts))
return;
- namespace_lock();
- lock_mount_hash();
+ guard(namespace_excl)();
+ guard(mount_writer)();
/* extract from the expiration list every vfsmount that matches the
* following criteria:
@@ -3858,8 +3834,6 @@ void mark_mounts_for_expiry(struct list_head *mounts)
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
- unlock_mount_hash();
- namespace_unlock();
}
EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
@@ -3987,7 +3961,7 @@ static char *copy_mount_string(const void __user *data)
* Therefore, if this magic number is present, it carries no information
* and must be discarded.
*/
-int path_mount(const char *dev_name, struct path *path,
+int path_mount(const char *dev_name, const struct path *path,
const char *type_page, unsigned long flags, void *data_page)
{
unsigned int mnt_flags = 0, sb_flags;
@@ -4069,15 +4043,13 @@ int path_mount(const char *dev_name, struct path *path,
int do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
- struct path path;
+ struct path path __free(path_put) = {};
int ret;
ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
if (ret)
return ret;
- ret = path_mount(dev_name, &path, type_page, flags, data_page);
- path_put(&path);
- return ret;
+ return path_mount(dev_name, &path, type_page, flags, data_page);
}
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
@@ -4138,7 +4110,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
- struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
+ struct vfsmount *rootmnt __free(mntput) = NULL;
+ struct vfsmount *pwdmnt __free(mntput) = NULL;
struct mount *p, *q;
struct mount *old;
struct mount *new;
@@ -4157,23 +4130,19 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
if (IS_ERR(new_ns))
return new_ns;
- namespace_lock();
+ guard(namespace_excl)();
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
copy_flags |= CL_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
- namespace_unlock();
- ns_common_free(ns);
- dec_mnt_namespaces(new_ns->ucounts);
- mnt_ns_release(new_ns);
+ emptied_ns = new_ns;
return ERR_CAST(new);
}
if (user_ns != ns->user_ns) {
- lock_mount_hash();
+ guard(mount_writer)();
lock_mnt_tree(new);
- unlock_mount_hash();
}
new_ns->root = new;
@@ -4205,13 +4174,6 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
- namespace_unlock();
-
- if (rootmnt)
- mntput(rootmnt);
- if (pwdmnt)
- mntput(pwdmnt);
-
ns_tree_add_raw(new_ns);
return new_ns;
}
@@ -4436,7 +4398,8 @@ err_unlock:
return ret;
}
-static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
+static inline int vfs_move_mount(const struct path *from_path,
+ const struct path *to_path,
enum mnt_tree_flags_t mflags)
{
int ret;
@@ -4542,7 +4505,7 @@ SYSCALL_DEFINE5(move_mount,
/*
* Return true if path is reachable from root
*
- * namespace_sem or mount_lock is held
+ * locks: mount_locked_reader || namespace_shared && is_mounted(mnt)
*/
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
const struct path *root)
@@ -4556,11 +4519,8 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
bool path_is_under(const struct path *path1, const struct path *path2)
{
- bool res;
- read_seqlock_excl(&mount_lock);
- res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
- read_sequnlock_excl(&mount_lock);
- return res;
+ guard(mount_locked_reader)();
+ return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
}
EXPORT_SYMBOL(path_is_under);
@@ -4592,9 +4552,10 @@ EXPORT_SYMBOL(path_is_under);
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
const char __user *, put_old)
{
- struct path new, old, root;
+ struct path new __free(path_put) = {};
+ struct path old __free(path_put) = {};
+ struct path root __free(path_put) = {};
struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
- struct pinned_mountpoint old_mp = {};
int error;
if (!may_mount())
@@ -4603,57 +4564,54 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
error = user_path_at(AT_FDCWD, new_root,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
if (error)
- goto out0;
+ return error;
error = user_path_at(AT_FDCWD, put_old,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
if (error)
- goto out1;
+ return error;
error = security_sb_pivotroot(&old, &new);
if (error)
- goto out2;
+ return error;
get_fs_root(current->fs, &root);
- error = lock_mount(&old, &old_mp);
- if (error)
- goto out3;
- error = -EINVAL;
+ LOCK_MOUNT(old_mp, &old);
+ old_mnt = old_mp.parent;
+ if (IS_ERR(old_mnt))
+ return PTR_ERR(old_mnt);
+
new_mnt = real_mount(new.mnt);
root_mnt = real_mount(root.mnt);
- old_mnt = real_mount(old.mnt);
ex_parent = new_mnt->mnt_parent;
root_parent = root_mnt->mnt_parent;
if (IS_MNT_SHARED(old_mnt) ||
IS_MNT_SHARED(ex_parent) ||
IS_MNT_SHARED(root_parent))
- goto out4;
+ return -EINVAL;
if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
- goto out4;
+ return -EINVAL;
if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
- goto out4;
- error = -ENOENT;
+ return -EINVAL;
if (d_unlinked(new.dentry))
- goto out4;
- error = -EBUSY;
+ return -ENOENT;
if (new_mnt == root_mnt || old_mnt == root_mnt)
- goto out4; /* loop, on the same file system */
- error = -EINVAL;
+ return -EBUSY; /* loop, on the same file system */
if (!path_mounted(&root))
- goto out4; /* not a mountpoint */
+ return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
- goto out4; /* absolute root */
+ return -EINVAL; /* absolute root */
if (!path_mounted(&new))
- goto out4; /* not a mountpoint */
+ return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
- goto out4; /* absolute root */
+ return -EINVAL; /* absolute root */
/* make sure we can reach put_old from new_root */
- if (!is_path_reachable(old_mnt, old.dentry, &new))
- goto out4;
+ if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new))
+ return -EINVAL;
/* make certain new is below the root */
if (!is_path_reachable(new_mnt, new.dentry, &root))
- goto out4;
+ return -EINVAL;
lock_mount_hash();
umount_mnt(new_mnt);
if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
@@ -4672,17 +4630,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
mnt_notify_add(root_mnt);
mnt_notify_add(new_mnt);
chroot_fs_refs(&root, &new);
- error = 0;
-out4:
- unlock_mount(&old_mp);
-out3:
- path_put(&root);
-out2:
- path_put(&old);
-out1:
- path_put(&new);
-out0:
- return error;
+ return 0;
}
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
@@ -4772,8 +4720,10 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
if (!mnt_allow_writers(kattr, m)) {
err = mnt_hold_writers(m);
- if (err)
+ if (err) {
+ m = next_mnt(m, mnt);
break;
+ }
}
if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
@@ -4781,25 +4731,9 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
}
if (err) {
- struct mount *p;
-
- /*
- * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
- * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
- * mounts and needs to take care to include the first mount.
- */
- for (p = mnt; p; p = next_mnt(p, mnt)) {
- /* If we had to hold writers unblock them. */
- if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
- mnt_unhold_writers(p);
-
- /*
- * We're done once the first mount we changed got
- * MNT_WRITE_HOLD unset.
- */
- if (p == m)
- break;
- }
+ /* undo all mnt_hold_writers() we'd done */
+ for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt))
+ mnt_unhold_writers(p);
}
return err;
}
@@ -4830,8 +4764,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
WRITE_ONCE(m->mnt.mnt_flags, flags);
/* If we had to hold writers unblock them. */
- if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
- mnt_unhold_writers(m);
+ mnt_unhold_writers(m);
if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
@@ -4841,7 +4774,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
touch_mnt_namespace(mnt->mnt_ns);
}
-static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
+static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr)
{
struct mount *mnt = real_mount(path->mnt);
int err = 0;
@@ -5639,6 +5572,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
STATMOUNT_MNT_UIDMAP | \
STATMOUNT_MNT_GIDMAP)
+/* locks: namespace_shared */
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
@@ -5885,7 +5819,7 @@ retry:
if (ret)
return ret;
- scoped_guard(rwsem_read, &namespace_sem)
+ scoped_guard(namespace_shared)
ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
if (!ret)
@@ -5906,6 +5840,7 @@ struct klistmount {
struct path root;
};
+/* locks: namespace_shared */
static ssize_t do_listmount(struct klistmount *kls, bool reverse)
{
struct mnt_namespace *ns = kls->ns;
@@ -6040,7 +5975,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
* We only need to guard against mount topology changes as
* listmount() doesn't care about any mount properties.
*/
- scoped_guard(rwsem_read, &namespace_sem)
+ scoped_guard(namespace_shared)
ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
if (ret <= 0)
return ret;
@@ -6127,12 +6062,10 @@ void put_mnt_ns(struct mnt_namespace *ns)
{
if (!ns_ref_put(ns))
return;
- namespace_lock();
+ guard(namespace_excl)();
emptied_ns = ns;
- lock_mount_hash();
+ guard(mount_writer)();
umount_tree(ns->root, 0);
- unlock_mount_hash();
- namespace_unlock();
}
struct vfsmount *kern_mount(struct file_system_type *type)
@@ -6181,25 +6114,18 @@ bool our_mnt(struct vfsmount *mnt)
bool current_chrooted(void)
{
/* Does the current process have a non-standard root */
- struct path ns_root;
- struct path fs_root;
- bool chrooted;
-
- /* Find the namespace root */
- ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
- ns_root.dentry = ns_root.mnt->mnt_root;
- path_get(&ns_root);
- while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
- ;
+ struct path fs_root __free(path_put) = {};
+ struct mount *root;
get_fs_root(current->fs, &fs_root);
- chrooted = !path_equal(&fs_root, &ns_root);
+ /* Find the namespace root */
+
+ guard(mount_locked_reader)();
- path_put(&fs_root);
- path_put(&ns_root);
+ root = topmost_overmount(current->nsproxy->mnt_ns->root);
- return chrooted;
+ return fs_root.mnt != &root->mnt || !path_mounted(&fs_root);
}
static bool mnt_already_visible(struct mnt_namespace *ns,
@@ -6208,9 +6134,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
{
int new_flags = *new_mnt_flags;
struct mount *mnt, *n;
- bool visible = false;
- down_read(&namespace_sem);
+ guard(namespace_shared)();
rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
struct mount *child;
int mnt_flags;
@@ -6257,13 +6182,10 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
/* Preserve the locked attributes */
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
MNT_LOCK_ATIME);
- visible = true;
- goto found;
+ return true;
next: ;
}
-found:
- up_read(&namespace_sem);
- return visible;
+ return false;
}
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
diff --git a/fs/pnode.c b/fs/pnode.c
index 6f7d02f3fa98..5d91c3e58d2a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -29,6 +29,7 @@ static inline struct mount *next_slave(struct mount *p)
return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave);
}
+/* locks: namespace_shared && is_mounted(mnt) */
static struct mount *get_peer_under_root(struct mount *mnt,
struct mnt_namespace *ns,
const struct path *root)
@@ -50,7 +51,7 @@ static struct mount *get_peer_under_root(struct mount *mnt,
* Get ID of closest dominating peer group having a representative
* under the given root.
*
- * Caller must hold namespace_sem
+ * locks: namespace_shared
*/
int get_dominating_id(struct mount *mnt, const struct path *root)
{
@@ -70,19 +71,6 @@ static inline bool will_be_unmounted(struct mount *m)
return m->mnt.mnt_flags & MNT_UMOUNT;
}
-static struct mount *propagation_source(struct mount *mnt)
-{
- do {
- struct mount *m;
- for (m = next_peer(mnt); m != mnt; m = next_peer(m)) {
- if (!will_be_unmounted(m))
- return m;
- }
- mnt = mnt->mnt_master;
- } while (mnt && will_be_unmounted(mnt));
- return mnt;
-}
-
static void transfer_propagation(struct mount *mnt, struct mount *to)
{
struct hlist_node *p = NULL, *n;
@@ -111,11 +99,10 @@ void change_mnt_propagation(struct mount *mnt, int type)
return;
}
if (IS_MNT_SHARED(mnt)) {
- if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list))
- m = propagation_source(mnt);
if (list_empty(&mnt->mnt_share)) {
mnt_release_group_id(mnt);
} else {
+ m = next_peer(mnt);
list_del_init(&mnt->mnt_share);
mnt->mnt_group_id = 0;
}
@@ -136,6 +123,57 @@ void change_mnt_propagation(struct mount *mnt, int type)
}
}
+static struct mount *trace_transfers(struct mount *m)
+{
+ while (1) {
+ struct mount *next = next_peer(m);
+
+ if (next != m) {
+ list_del_init(&m->mnt_share);
+ m->mnt_group_id = 0;
+ m->mnt_master = next;
+ } else {
+ if (IS_MNT_SHARED(m))
+ mnt_release_group_id(m);
+ next = m->mnt_master;
+ }
+ hlist_del_init(&m->mnt_slave);
+ CLEAR_MNT_SHARED(m);
+ SET_MNT_MARK(m);
+
+ if (!next || !will_be_unmounted(next))
+ return next;
+ if (IS_MNT_MARKED(next))
+ return next->mnt_master;
+ m = next;
+ }
+}
+
+static void set_destinations(struct mount *m, struct mount *master)
+{
+ struct mount *next;
+
+ while ((next = m->mnt_master) != master) {
+ m->mnt_master = master;
+ m = next;
+ }
+}
+
+void bulk_make_private(struct list_head *set)
+{
+ struct mount *m;
+
+ list_for_each_entry(m, set, mnt_list)
+ if (!IS_MNT_MARKED(m))
+ set_destinations(m, trace_transfers(m));
+
+ list_for_each_entry(m, set, mnt_list) {
+ transfer_propagation(m, m->mnt_master);
+ m->mnt_master = NULL;
+ CLEAR_MNT_MARK(m);
+ }
+}
+
static struct mount *__propagation_next(struct mount *m,
struct mount *origin)
{
@@ -304,9 +342,8 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
err = PTR_ERR(this);
break;
}
- read_seqlock_excl(&mount_lock);
- mnt_set_mountpoint(n, dest_mp, this);
- read_sequnlock_excl(&mount_lock);
+ scoped_guard(mount_locked_reader)
+ mnt_set_mountpoint(n, dest_mp, this);
if (n->mnt_master)
SET_MNT_MARK(n->mnt_master);
copy = this;
diff --git a/fs/pnode.h b/fs/pnode.h
index 00ab153e3e9d..b029db225f33 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -42,6 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2)
}
void change_mnt_propagation(struct mount *, int);
+void bulk_make_private(struct list_head *);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct hlist_head *);
void propagate_umount(struct list_head *);
diff --git a/fs/super.c b/fs/super.c
index f4fa0e93c463..5bab94fb7e03 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
if (!s)
return NULL;
- INIT_LIST_HEAD(&s->s_mounts);
s->s_user_ns = get_user_ns(user_ns);
init_rwsem(&s->s_umount);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -408,7 +407,7 @@ static void __put_super(struct super_block *s)
list_del_init(&s->s_list);
WARN_ON(s->s_dentry_lru.node);
WARN_ON(s->s_inode_lru.node);
- WARN_ON(!list_empty(&s->s_mounts));
+ WARN_ON(s->s_mounts);
call_rcu(&s->rcu, destroy_super_rcu);
}
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75fb216b0f7a..d3c023ff1a86 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1434,6 +1434,8 @@ struct sb_writers {
struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};
+struct mount;
+
struct super_block {
struct list_head s_list; /* Keep this first */
dev_t s_dev; /* search index; _not_ kdev_t */
@@ -1468,7 +1470,7 @@ struct super_block {
__u16 s_encoding_flags;
#endif
struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
- struct list_head s_mounts; /* list of mounts; _not_ for fs use */
+ struct mount *s_mounts; /* list of mounts; _not_ for fs use */
struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */
struct file *s_bdev_file;
struct backing_dev_info *s_bdi;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5f9c053b0897..acfe7ef86a1b 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -33,7 +33,6 @@ enum mount_flags {
MNT_NOSYMFOLLOW = 0x80,
MNT_SHRINKABLE = 0x100,
- MNT_WRITE_HOLD = 0x200,
MNT_INTERNAL = 0x4000,
@@ -52,7 +51,7 @@ enum mount_flags {
| MNT_READONLY | MNT_NOSYMFOLLOW,
MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,
- MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED |
+ MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED |
MNT_SYNC_UMOUNT | MNT_LOCKED
};
@@ -77,7 +76,7 @@ extern void mntput(struct vfsmount *mnt);
extern struct vfsmount *mntget(struct vfsmount *mnt);
extern void mnt_make_shortterm(struct vfsmount *mnt);
extern struct vfsmount *mnt_clone_internal(const struct path *path);
-extern bool __mnt_is_readonly(struct vfsmount *mnt);
+extern bool __mnt_is_readonly(const struct vfsmount *mnt);
extern bool mnt_may_suid(struct vfsmount *mnt);
extern struct vfsmount *clone_private_mount(const struct path *path);
@@ -104,8 +103,8 @@ extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
int do_mount(const char *, const char __user *,
const char *, unsigned long, void *);
-extern struct path *collect_paths(const struct path *, struct path *, unsigned);
-extern void drop_collected_paths(struct path *, struct path *);
+extern const struct path *collect_paths(const struct path *, struct path *, unsigned);
+extern void drop_collected_paths(const struct path *, const struct path *);
extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num);
extern int cifs_root_data(char **dev, char **opts);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1605df0a171e..fda6beb041e0 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -680,7 +680,7 @@ void audit_trim_trees(void)
struct audit_tree *tree;
struct path path;
struct audit_node *node;
- struct path *paths;
+ const struct path *paths;
struct path array[16];
int err;
@@ -703,7 +703,7 @@ void audit_trim_trees(void)
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
node->index |= 1U<<31;
- for (struct path *p = paths; p->dentry; p++) {
+ for (const struct path *p = paths; p->dentry; p++) {
struct inode *inode = p->dentry->d_inode;
if (inode_to_key(inode) == chunk->key) {
node->index &= ~(1U<<31);
@@ -742,9 +742,9 @@ void audit_put_tree(struct audit_tree *tree)
put_tree(tree);
}
-static int tag_mounts(struct path *paths, struct audit_tree *tree)
+static int tag_mounts(const struct path *paths, struct audit_tree *tree)
{
- for (struct path *p = paths; p->dentry; p++) {
+ for (const struct path *p = paths; p->dentry; p++) {
int err = tag_chunk(p->dentry->d_inode, tree);
if (err)
return err;
@@ -807,7 +807,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
struct audit_tree *seed = rule->tree, *tree;
struct path path;
struct path array[16];
- struct path *paths;
+ const struct path *paths;
int err;
rule->tree = NULL;
@@ -879,7 +879,7 @@ int audit_tag_tree(char *old, char *new)
int failed = 0;
struct path path1, path2;
struct path array[16];
- struct path *paths;
+ const struct path *paths;
int err;
err = kern_path(new, 0, &path2);