diff options
Diffstat (limited to 'fs/namespace.c')
| -rw-r--r-- | fs/namespace.c | 818 | 
1 files changed, 354 insertions, 464 deletions
diff --git a/fs/namespace.c b/fs/namespace.c index 7ca4612c7ae9..ddfd4457d338 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -79,6 +79,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;  static DECLARE_RWSEM(namespace_sem);  static HLIST_HEAD(unmounted);	/* protected by namespace_sem */  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */  static DEFINE_SEQLOCK(mnt_ns_tree_lock);  #ifdef CONFIG_FSNOTIFY @@ -380,10 +381,9 @@ static struct mount *alloc_vfsmnt(const char *name)  		INIT_LIST_HEAD(&mnt->mnt_list);  		INIT_LIST_HEAD(&mnt->mnt_expire);  		INIT_LIST_HEAD(&mnt->mnt_share); -		INIT_LIST_HEAD(&mnt->mnt_slave_list); -		INIT_LIST_HEAD(&mnt->mnt_slave); +		INIT_HLIST_HEAD(&mnt->mnt_slave_list); +		INIT_HLIST_NODE(&mnt->mnt_slave);  		INIT_HLIST_NODE(&mnt->mnt_mp_list); -		INIT_LIST_HEAD(&mnt->mnt_umounting);  		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);  		RB_CLEAR_NODE(&mnt->mnt_node);  		mnt->mnt.mnt_idmap = &nop_mnt_idmap; @@ -894,7 +894,7 @@ struct vfsmount *lookup_mnt(const struct path *path)   * namespace not just a mount that happens to have some specified   * parent mount.   */ -bool __is_local_mountpoint(struct dentry *dentry) +bool __is_local_mountpoint(const struct dentry *dentry)  {  	struct mnt_namespace *ns = current->nsproxy->mnt_ns;  	struct mount *mnt, *n; @@ -911,42 +911,48 @@ bool __is_local_mountpoint(struct dentry *dentry)  	return is_covered;  } -static struct mountpoint *lookup_mountpoint(struct dentry *dentry) +struct pinned_mountpoint { +	struct hlist_node node; +	struct mountpoint *mp; +}; + +static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)  {  	struct hlist_head *chain = mp_hash(dentry);  	struct mountpoint *mp;  	hlist_for_each_entry(mp, chain, m_hash) {  		if (mp->m_dentry == dentry) { -			mp->m_count++; -			return mp; +			hlist_add_head(&m->node, &mp->m_list); +			m->mp = mp; +			return true;  		}  	} -	return NULL; +	return false;  } -static struct mountpoint *get_mountpoint(struct dentry *dentry) +static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)  { -	struct mountpoint *mp, *new = NULL; +	struct mountpoint *mp __free(kfree) = NULL; +	bool found;  	int ret;  	if (d_mountpoint(dentry)) {  		/* might be worth a WARN_ON() */  		if (d_unlinked(dentry)) -			return ERR_PTR(-ENOENT); +			return -ENOENT;  mountpoint:  		read_seqlock_excl(&mount_lock); -		mp = lookup_mountpoint(dentry); +		found = lookup_mountpoint(dentry, m);  		read_sequnlock_excl(&mount_lock); -		if (mp) -			goto done; +		if (found) +			return 0;  	} -	if (!new) -		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); -	if (!new) -		return ERR_PTR(-ENOMEM); - +	if (!mp) +		mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); +	if (!mp) +		return -ENOMEM;  	/* Exactly one processes may set d_mounted */  	ret = d_set_mounted(dentry); @@ -956,34 +962,28 @@ mountpoint:  		goto mountpoint;  	/* The dentry is not available as a mountpoint? */ -	mp = ERR_PTR(ret);  	if (ret) -		goto done; +		return ret;  	/* Add the new mountpoint to the hash table */  	read_seqlock_excl(&mount_lock); -	new->m_dentry = dget(dentry); -	new->m_count = 1; -	hlist_add_head(&new->m_hash, mp_hash(dentry)); -	INIT_HLIST_HEAD(&new->m_list); +	mp->m_dentry = dget(dentry); +	hlist_add_head(&mp->m_hash, mp_hash(dentry)); +	INIT_HLIST_HEAD(&mp->m_list); +	hlist_add_head(&m->node, &mp->m_list); +	m->mp = no_free_ptr(mp);  	read_sequnlock_excl(&mount_lock); - -	mp = new; -	new = NULL; -done: -	kfree(new); -	return mp; +	return 0;  }  /*   * vfsmount lock must be held.  Additionally, the caller is responsible   * for serializing calls for given disposal list.   */ -static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) +static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list)  { -	if (!--mp->m_count) { +	if (hlist_empty(&mp->m_list)) {  		struct dentry *dentry = mp->m_dentry; -		BUG_ON(!hlist_empty(&mp->m_list));  		spin_lock(&dentry->d_lock);  		dentry->d_flags &= ~DCACHE_MOUNTED;  		spin_unlock(&dentry->d_lock); @@ -993,10 +993,15 @@ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)  	}  } -/* called with namespace_lock and vfsmount lock */ -static void put_mountpoint(struct mountpoint *mp) +/* + * locks: mount_lock [read_seqlock_excl], namespace_sem [excl] + */ +static void unpin_mountpoint(struct pinned_mountpoint *m)  { -	__put_mountpoint(mp, &ex_mountpoints); +	if (m->mp) { +		hlist_del(&m->node); +		maybe_free_mountpoint(m->mp, &ex_mountpoints); +	}  }  static inline int check_mnt(struct mount *mnt) @@ -1038,11 +1043,14 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)  }  /* - * vfsmount lock must be held for write + * locks: mount_lock[write_seqlock]   */ -static struct mountpoint *unhash_mnt(struct mount *mnt) +static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list)  {  	struct mountpoint *mp; +	struct mount *parent = mnt->mnt_parent; +	if (unlikely(parent->overmount == mnt)) +		parent->overmount = NULL;  	mnt->mnt_parent = mnt;  	mnt->mnt_mountpoint = mnt->mnt.mnt_root;  	list_del_init(&mnt->mnt_child); @@ -1050,15 +1058,15 @@ static struct mountpoint *unhash_mnt(struct mount *mnt)  	hlist_del_init(&mnt->mnt_mp_list);  	mp = mnt->mnt_mp;  	mnt->mnt_mp = NULL; -	return mp; +	maybe_free_mountpoint(mp, shrink_list);  }  /* - * vfsmount lock must be held for write + * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints)   */  static void umount_mnt(struct mount *mnt)  { -	put_mountpoint(unhash_mnt(mnt)); +	__umount_mnt(mnt, &ex_mountpoints);  }  /* @@ -1068,43 +1076,17 @@ void mnt_set_mountpoint(struct mount *mnt,  			struct mountpoint *mp,  			struct mount *child_mnt)  { -	mp->m_count++; -	mnt_add_count(mnt, 1);	/* essentially, that's mntget */  	child_mnt->mnt_mountpoint = mp->m_dentry;  	child_mnt->mnt_parent = mnt;  	child_mnt->mnt_mp = mp;  	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);  } -/** - * mnt_set_mountpoint_beneath - mount a mount beneath another one - * - * @new_parent: the source mount - * @top_mnt:    the mount beneath which @new_parent is mounted - * @new_mp:     the new mountpoint of @top_mnt on @new_parent - * - * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and - * parent @top_mnt->mnt_parent and mount it on top of @new_parent at - * @new_mp. And mount @new_parent on the old parent and old - * mountpoint of @top_mnt. - * - * Context: This function expects namespace_lock() and lock_mount_hash() - *          to have been acquired in that order. - */ -static void mnt_set_mountpoint_beneath(struct mount *new_parent, -				       struct mount *top_mnt, -				       struct mountpoint *new_mp) -{ -	struct mount *old_top_parent = top_mnt->mnt_parent; -	struct mountpoint *old_top_mp = top_mnt->mnt_mp; - -	mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent); -	mnt_change_mountpoint(new_parent, new_mp, top_mnt); -} - - -static void __attach_mnt(struct mount *mnt, struct mount *parent) +static void make_visible(struct mount *mnt)  { +	struct mount *parent = mnt->mnt_parent; +	if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root)) +		parent->overmount = mnt;  	hlist_add_head_rcu(&mnt->mnt_hash,  			   m_hash(&parent->mnt, mnt->mnt_mountpoint));  	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); @@ -1116,51 +1098,34 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent)   * @parent:  the parent   * @mnt:     the new mount   * @mp:      the new mountpoint - * @beneath: whether to mount @mnt beneath or on top of @parent   * - * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt + * Mount @mnt at @mp on @parent. Then attach @mnt   * to @parent's child mount list and to @mount_hashtable.   * - * If @beneath is true, remove @mnt from its current parent and - * mountpoint and mount it on @mp on @parent, and mount @parent on the - * old parent and old mountpoint of @mnt. Finally, attach @parent to - * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. - * - * Note, when __attach_mnt() is called @mnt->mnt_parent already points + * Note, when make_visible() is called @mnt->mnt_parent already points   * to the correct parent.   *   * Context: This function expects namespace_lock() and lock_mount_hash()   *          to have been acquired in that order.   */  static void attach_mnt(struct mount *mnt, struct mount *parent, -		       struct mountpoint *mp, bool beneath) +		       struct mountpoint *mp)  { -	if (beneath) -		mnt_set_mountpoint_beneath(mnt, parent, mp); -	else -		mnt_set_mountpoint(parent, mp, mnt); -	/* -	 * Note, @mnt->mnt_parent has to be used. If @mnt was mounted -	 * beneath @parent then @mnt will need to be attached to -	 * @parent's old parent, not @parent. IOW, @mnt->mnt_parent -	 * isn't the same mount as @parent. -	 */ -	__attach_mnt(mnt, mnt->mnt_parent); +	mnt_set_mountpoint(parent, mp, mnt); +	make_visible(mnt);  }  void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)  {  	struct mountpoint *old_mp = mnt->mnt_mp; -	struct mount *old_parent = mnt->mnt_parent;  	list_del_init(&mnt->mnt_child);  	hlist_del_init(&mnt->mnt_mp_list);  	hlist_del_init_rcu(&mnt->mnt_hash); -	attach_mnt(mnt, parent, mp, false); +	attach_mnt(mnt, parent, mp); -	put_mountpoint(old_mp); -	mnt_add_count(old_parent, -1); +	maybe_free_mountpoint(old_mp, &ex_mountpoints);  }  static inline struct mount *node_to_mount(struct rb_node *node) @@ -1197,32 +1162,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)  	mnt_notify_add(mnt);  } -/* - * vfsmount lock must be held for write - */ -static void commit_tree(struct mount *mnt) -{ -	struct mount *parent = mnt->mnt_parent; -	struct mount *m; -	LIST_HEAD(head); -	struct mnt_namespace *n = parent->mnt_ns; - -	BUG_ON(parent == mnt); - -	list_add_tail(&head, &mnt->mnt_list); -	while (!list_empty(&head)) { -		m = list_first_entry(&head, typeof(*m), mnt_list); -		list_del(&m->mnt_list); - -		mnt_add_to_ns(n, m); -	} -	n->nr_mounts += n->pending_mounts; -	n->pending_mounts = 0; - -	__attach_mnt(mnt, parent); -	touch_mnt_namespace(n); -} -  static struct mount *next_mnt(struct mount *p, struct mount *root)  {  	struct list_head *next = p->mnt_mounts.next; @@ -1249,6 +1188,27 @@ static struct mount *skip_mnt_tree(struct mount *p)  	return p;  } +/* + * vfsmount lock must be held for write + */ +static void commit_tree(struct mount *mnt) +{ +	struct mnt_namespace *n = mnt->mnt_parent->mnt_ns; + +	if (!mnt_ns_attached(mnt)) { +		for (struct mount *m = mnt; m; m = next_mnt(m, mnt)) +			if (unlikely(mnt_ns_attached(m))) +				m = skip_mnt_tree(m); +			else +				mnt_add_to_ns(n, m); +		n->nr_mounts += n->pending_mounts; +		n->pending_mounts = 0; +	} + +	make_visible(mnt); +	touch_mnt_namespace(n); +} +  /**   * vfs_create_mount - Create a mount for a configured superblock   * @fc: The configuration context with the superblock attached @@ -1296,6 +1256,15 @@ struct vfsmount *fc_mount(struct fs_context *fc)  }  EXPORT_SYMBOL(fc_mount); +struct vfsmount *fc_mount_longterm(struct fs_context *fc) +{ +	struct vfsmount *mnt = fc_mount(fc); +	if (!IS_ERR(mnt)) +		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; +	return mnt; +} +EXPORT_SYMBOL(fc_mount_longterm); +  struct vfsmount *vfs_kern_mount(struct file_system_type *type,  				int flags, const char *name,  				void *data) @@ -1337,7 +1306,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,  	if (!mnt)  		return ERR_PTR(-ENOMEM); -	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) +	mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) & +			     ~MNT_INTERNAL_FLAGS; + +	if (flag & (CL_SLAVE | CL_PRIVATE))  		mnt->mnt_group_id = 0; /* not a peer of original */  	else  		mnt->mnt_group_id = old->mnt_group_id; @@ -1348,8 +1320,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,  			goto out_free;  	} -	mnt->mnt.mnt_flags = old->mnt.mnt_flags; -	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); +	if (mnt->mnt_group_id) +		set_mnt_shared(mnt);  	atomic_inc(&sb->s_active);  	mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1362,30 +1334,19 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,  	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);  	unlock_mount_hash(); -	if ((flag & CL_SLAVE) || -	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { -		list_add(&mnt->mnt_slave, &old->mnt_slave_list); +	if (flag & CL_PRIVATE)	// we are done with it +		return mnt; + +	if (peers(mnt, old)) +		list_add(&mnt->mnt_share, &old->mnt_share); + +	if ((flag & CL_SLAVE) && old->mnt_group_id) { +		hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list);  		mnt->mnt_master = old; -		CLEAR_MNT_SHARED(mnt); -	} else if (!(flag & CL_PRIVATE)) { -		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) -			list_add(&mnt->mnt_share, &old->mnt_share); -		if (IS_MNT_SLAVE(old)) -			list_add(&mnt->mnt_slave, &old->mnt_slave); +	} else if (IS_MNT_SLAVE(old)) { +		hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave);  		mnt->mnt_master = old->mnt_master; -	} else { -		CLEAR_MNT_SHARED(mnt); -	} -	if (flag & CL_MAKE_SHARED) -		set_mnt_shared(mnt); - -	/* stick the duplicate mount on the same expiry list -	 * as the original if that was on one */ -	if (flag & CL_EXPIRE) { -		if (!list_empty(&old->mnt_expire)) -			list_add(&mnt->mnt_expire, &old->mnt_expire);  	} -  	return mnt;   out_free: @@ -1478,11 +1439,13 @@ static void mntput_no_expire(struct mount *mnt)  	rcu_read_unlock();  	list_del(&mnt->mnt_instance); +	if (unlikely(!list_empty(&mnt->mnt_expire))) +		list_del(&mnt->mnt_expire);  	if (unlikely(!list_empty(&mnt->mnt_mounts))) {  		struct mount *p, *tmp;  		list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) { -			__put_mountpoint(unhash_mnt(p), &list); +			__umount_mnt(p, &list);  			hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);  		}  	} @@ -1679,23 +1642,19 @@ const struct seq_operations mounts_op = {  int may_umount_tree(struct vfsmount *m)  {  	struct mount *mnt = real_mount(m); -	int actual_refs = 0; -	int minimum_refs = 0; -	struct mount *p; -	BUG_ON(!m); +	bool busy = false;  	/* write lock needed for mnt_get_count */  	lock_mount_hash(); -	for (p = mnt; p; p = next_mnt(p, mnt)) { -		actual_refs += mnt_get_count(p); -		minimum_refs += 2; +	for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) { +		if (mnt_get_count(p) > (p == mnt ? 2 : 1)) { +			busy = true; +			break; +		}  	}  	unlock_mount_hash(); -	if (actual_refs > minimum_refs) -		return 0; - -	return 1; +	return !busy;  }  EXPORT_SYMBOL(may_umount_tree); @@ -1771,15 +1730,18 @@ static bool need_notify_mnt_list(void)  }  #endif +static void free_mnt_ns(struct mnt_namespace *);  static void namespace_unlock(void)  {  	struct hlist_head head;  	struct hlist_node *p;  	struct mount *m; +	struct mnt_namespace *ns = emptied_ns;  	LIST_HEAD(list);  	hlist_move_list(&unmounted, &head);  	list_splice_init(&ex_mountpoints, &list); +	emptied_ns = NULL;  	if (need_notify_mnt_list()) {  		/* @@ -1793,6 +1755,11 @@ static void namespace_unlock(void)  	} else {  		up_write(&namespace_sem);  	} +	if (unlikely(ns)) { +		/* Make sure we notice when we leak mounts. */ +		VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); +		free_mnt_ns(ns); +	}  	shrink_dentry_list(&list); @@ -1865,9 +1832,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)  	for (p = mnt; p; p = next_mnt(p, mnt)) {  		p->mnt.mnt_flags |= MNT_UMOUNT;  		if (mnt_ns_attached(p)) -			move_from_ns(p, &tmp_list); -		else -			list_move(&p->mnt_list, &tmp_list); +			move_from_ns(p); +		list_add_tail(&p->mnt_list, &tmp_list);  	}  	/* Hide the mounts from mnt_mounts */ @@ -1896,7 +1862,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)  		disconnect = disconnect_mount(p, how);  		if (mnt_has_parent(p)) { -			mnt_add_count(p->mnt_parent, -1);  			if (!disconnect) {  				/* Don't forget about p */  				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); @@ -1973,7 +1938,7 @@ static int do_umount(struct mount *mnt, int flags)  		 * all race cases, but it's a slowpath.  		 */  		lock_mount_hash(); -		if (mnt_get_count(mnt) != 2) { +		if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) {  			unlock_mount_hash();  			return -EBUSY;  		} @@ -2019,23 +1984,27 @@ static int do_umount(struct mount *mnt, int flags)  	namespace_lock();  	lock_mount_hash(); -	/* Recheck MNT_LOCKED with the locks held */ +	/* Repeat the earlier racy checks, now that we are holding the locks */  	retval = -EINVAL; +	if (!check_mnt(mnt)) +		goto out; +  	if (mnt->mnt.mnt_flags & MNT_LOCKED)  		goto out; +	if (!mnt_has_parent(mnt)) /* not the absolute root */ +		goto out; +  	event++;  	if (flags & MNT_DETACH) { -		if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) -			umount_tree(mnt, UMOUNT_PROPAGATE); +		umount_tree(mnt, UMOUNT_PROPAGATE);  		retval = 0;  	} else {  		smp_mb(); // paired with __legitimize_mnt()  		shrink_submounts(mnt);  		retval = -EBUSY;  		if (!propagate_mount_busy(mnt, 2)) { -			if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) -				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); +			umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);  			retval = 0;  		}  	} @@ -2053,29 +2022,28 @@ out:   * detach_mounts allows lazily unmounting those mounts instead of   * leaking them.   * - * The caller may hold dentry->d_inode->i_mutex. + * The caller may hold dentry->d_inode->i_rwsem.   */  void __detach_mounts(struct dentry *dentry)  { -	struct mountpoint *mp; +	struct pinned_mountpoint mp = {};  	struct mount *mnt;  	namespace_lock();  	lock_mount_hash(); -	mp = lookup_mountpoint(dentry); -	if (!mp) +	if (!lookup_mountpoint(dentry, &mp))  		goto out_unlock;  	event++; -	while (!hlist_empty(&mp->m_list)) { -		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); +	while (mp.node.next) { +		mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list);  		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {  			umount_mnt(mnt);  			hlist_add_head(&mnt->mnt_umount, &unmounted);  		}  		else umount_tree(mnt, UMOUNT_CONNECTED);  	} -	put_mountpoint(mp); +	unpin_mountpoint(&mp);  out_unlock:  	unlock_mount_hash();  	namespace_unlock(); @@ -2259,7 +2227,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,  		return dst_mnt;  	src_parent = src_root; -	dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;  	list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {  		if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) @@ -2294,8 +2261,16 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,  			if (IS_ERR(dst_mnt))  				goto out;  			lock_mount_hash(); -			list_add_tail(&dst_mnt->mnt_list, &res->mnt_list); -			attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false); +			if (src_mnt->mnt.mnt_flags & MNT_LOCKED) +				dst_mnt->mnt.mnt_flags |= MNT_LOCKED; +			if (unlikely(flag & CL_EXPIRE)) { +				/* stick the duplicate mount on the same expiry +				 * list as the original if that was on one */ +				if (!list_empty(&src_mnt->mnt_expire)) +					list_add(&dst_mnt->mnt_expire, +						 &src_mnt->mnt_expire); +			} +			attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);  			unlock_mount_hash();  		}  	} @@ -2310,104 +2285,94 @@ out:  	return dst_mnt;  } -/* Caller should check returned pointer for errors */ - -struct vfsmount *collect_mounts(const struct path *path) +static inline bool extend_array(struct path **res, struct path **to_free, +				unsigned n, unsigned *count, unsigned new_count)  { -	struct mount *tree; -	namespace_lock(); -	if (!check_mnt(real_mount(path->mnt))) -		tree = ERR_PTR(-EINVAL); -	else -		tree = copy_tree(real_mount(path->mnt), path->dentry, -				 CL_COPY_ALL | CL_PRIVATE); -	namespace_unlock(); -	if (IS_ERR(tree)) -		return ERR_CAST(tree); -	return &tree->mnt; -} +	struct path *p; -static void free_mnt_ns(struct mnt_namespace *); -static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); +	if (likely(n < *count)) +		return true; +	p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL); +	if (p && *count) +		memcpy(p, *res, *count * sizeof(struct path)); +	*count = new_count; +	kfree(*to_free); +	*to_free = *res = p; +	return p; +} -static inline bool must_dissolve(struct mnt_namespace *mnt_ns) +struct path *collect_paths(const struct path *path, +			      struct path *prealloc, unsigned count)  { -	/* -        * This mount belonged to an anonymous mount namespace -        * but was moved to a non-anonymous mount namespace and -        * then unmounted. -        */ -	if (unlikely(!mnt_ns)) -		return false; +	struct mount *root = real_mount(path->mnt); +	struct mount *child; +	struct path *res = prealloc, *to_free = NULL; +	unsigned n = 0; -	/* -        * This mount belongs to a non-anonymous mount namespace -        * and we know that such a mount can never transition to -        * an anonymous mount namespace again. -        */ -	if (!is_anon_ns(mnt_ns)) { -		/* -		 * A detached mount either belongs to an anonymous mount -		 * namespace or a non-anonymous mount namespace. It -		 * should never belong to something purely internal. -		 */ -		VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL); -		return false; +	guard(rwsem_read)(&namespace_sem); + +	if (!check_mnt(root)) +		return ERR_PTR(-EINVAL); +	if (!extend_array(&res, &to_free, 0, &count, 32)) +		return ERR_PTR(-ENOMEM); +	res[n++] = *path; +	list_for_each_entry(child, &root->mnt_mounts, mnt_child) { +		if (!is_subdir(child->mnt_mountpoint, path->dentry)) +			continue; +		for (struct mount *m = child; m; m = next_mnt(m, child)) { +			if (!extend_array(&res, &to_free, n, &count, 2 * count)) +				return ERR_PTR(-ENOMEM); +			res[n].mnt = &m->mnt; +			res[n].dentry = m->mnt.mnt_root; +			n++; +		}  	} +	if (!extend_array(&res, &to_free, n, &count, count + 1)) +		return ERR_PTR(-ENOMEM); +	memset(res + n, 0, (count - n) * sizeof(struct path)); +	for (struct path *p = res; p->mnt; p++) +		path_get(p); +	return res; +} -	return true; +void drop_collected_paths(struct path *paths, struct path *prealloc) +{ +	for (struct path *p = paths; p->mnt; p++) +		path_put(p); +	if (paths != prealloc) +		kfree(paths);  } +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); +  void dissolve_on_fput(struct vfsmount *mnt)  { -	struct mnt_namespace *ns;  	struct mount *m = real_mount(mnt); +	/* +	 * m used to be the root of anon namespace; if it still is one, +	 * we need to dissolve the mount tree and free that namespace. +	 * Let's try to avoid taking namespace_sem if we can determine +	 * that there's nothing to do without it - rcu_read_lock() is +	 * enough to make anon_ns_root() memory-safe and once m has +	 * left its namespace, it's no longer our concern, since it will +	 * never become a root of anon ns again. +	 */ +  	scoped_guard(rcu) { -		if (!must_dissolve(READ_ONCE(m->mnt_ns))) +		if (!anon_ns_root(m))  			return;  	}  	scoped_guard(namespace_lock, &namespace_sem) { -		ns = m->mnt_ns; -		if (!must_dissolve(ns)) -			return; - -		/* -		 * After must_dissolve() we know that this is a detached -		 * mount in an anonymous mount namespace. -		 * -		 * Now when mnt_has_parent() reports that this mount -		 * tree has a parent, we know that this anonymous mount -		 * tree has been moved to another anonymous mount -		 * namespace. -		 * -		 * So when closing this file we cannot unmount the mount -		 * tree. This will be done when the file referring to -		 * the root of the anonymous mount namespace will be -		 * closed (It could already be closed but it would sync -		 * on @namespace_sem and wait for us to finish.). -		 */ -		if (mnt_has_parent(m)) +		if (!anon_ns_root(m))  			return; +		emptied_ns = m->mnt_ns;  		lock_mount_hash();  		umount_tree(m, UMOUNT_CONNECTED);  		unlock_mount_hash();  	} - -	/* Make sure we notice when we leak mounts. */ -	VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); -	free_mnt_ns(ns); -} - -void drop_collected_mounts(struct vfsmount *mnt) -{ -	namespace_lock(); -	lock_mount_hash(); -	umount_tree(real_mount(mnt), 0); -	unlock_mount_hash(); -	namespace_unlock();  }  static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) @@ -2486,9 +2451,7 @@ struct vfsmount *clone_private_mount(const struct path *path)  	 * loops get created.  	 */  	if (!check_mnt(old_mnt)) { -		if (!is_mounted(&old_mnt->mnt) || -			!is_anon_ns(old_mnt->mnt_ns) || -			mnt_has_parent(old_mnt)) +		if (!anon_ns_root(old_mnt))  			return ERR_PTR(-EINVAL);  		if (!check_for_nsfs_mounts(old_mnt)) @@ -2511,21 +2474,6 @@ struct vfsmount *clone_private_mount(const struct path *path)  }  EXPORT_SYMBOL_GPL(clone_private_mount); -int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, -		   struct vfsmount *root) -{ -	struct mount *mnt; -	int res = f(root, arg); -	if (res) -		return res; -	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { -		res = f(&mnt->mnt, arg); -		if (res) -			return res; -	} -	return 0; -} -  static void lock_mnt_tree(struct mount *mnt)  {  	struct mount *p; @@ -2547,7 +2495,7 @@ static void lock_mnt_tree(struct mount *mnt)  		if (flags & MNT_NOEXEC)  			flags |= MNT_LOCK_NOEXEC;  		/* Don't allow unprivileged users to reveal what is under a mount */ -		if (list_empty(&p->mnt_expire)) +		if (list_empty(&p->mnt_expire) && p != mnt)  			flags |= MNT_LOCKED;  		p->mnt.mnt_flags = flags;  	} @@ -2568,7 +2516,7 @@ static int invent_group_ids(struct mount *mnt, bool recurse)  	struct mount *p;  	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { -		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { +		if (!p->mnt_group_id) {  			int err = mnt_alloc_group_id(p);  			if (err) {  				cleanup_group_ids(mnt, p); @@ -2604,17 +2552,15 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)  }  enum mnt_tree_flags_t { -	MNT_TREE_MOVE = BIT(0), -	MNT_TREE_BENEATH = BIT(1), -	MNT_TREE_PROPAGATION = BIT(2), +	MNT_TREE_BENEATH = BIT(0), +	MNT_TREE_PROPAGATION = BIT(1),  };  /**   * attach_recursive_mnt - attach a source mount tree   * @source_mnt: mount tree to be attached - * @top_mnt:    mount that @source_mnt will be mounted on or mounted beneath + * @dest_mnt:   mount that @source_mnt will be mounted on   * @dest_mp:    the mountpoint @source_mnt will be mounted at - * @flags:      modify how @source_mnt is supposed to be attached   *   *  NOTE: in the table below explains the semantics when a source mount   *  of a given type is attached to a destination mount of a given type. @@ -2677,26 +2623,31 @@ enum mnt_tree_flags_t {   *         Otherwise a negative error code is returned.   */  static int attach_recursive_mnt(struct mount *source_mnt, -				struct mount *top_mnt, -				struct mountpoint *dest_mp, -				enum mnt_tree_flags_t flags) +				struct mount *dest_mnt, +				struct mountpoint *dest_mp)  {  	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;  	HLIST_HEAD(tree_list); -	struct mnt_namespace *ns = top_mnt->mnt_ns; -	struct mountpoint *smp; -	struct mount *child, *dest_mnt, *p; +	struct mnt_namespace *ns = dest_mnt->mnt_ns; +	struct pinned_mountpoint root = {}; +	struct mountpoint *shorter = NULL; +	struct mount *child, *p; +	struct mount *top;  	struct hlist_node *n;  	int err = 0; -	bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH; +	bool moving = mnt_has_parent(source_mnt);  	/*  	 * Preallocate a mountpoint in case the new mounts need to be  	 * mounted beneath mounts on the same mountpoint.  	 */ -	smp = get_mountpoint(source_mnt->mnt.mnt_root); -	if (IS_ERR(smp)) -		return PTR_ERR(smp); +	for (top = source_mnt; unlikely(top->overmount); top = top->overmount) { +		if (!shorter && is_mnt_ns_file(top->mnt.mnt_root)) +			shorter = top->mnt_mp; +	} +	err = get_mountpoint(top->mnt.mnt_root, &root); +	if (err) +		return err;  	/* Is there space to add these mounts to the mount namespace? */  	if (!moving) { @@ -2705,11 +2656,6 @@ static int attach_recursive_mnt(struct mount *source_mnt,  			goto out;  	} -	if (beneath) -		dest_mnt = top_mnt->mnt_parent; -	else -		dest_mnt = top_mnt; -  	if (IS_MNT_SHARED(dest_mnt)) {  		err = invent_group_ids(source_mnt, true);  		if (err) @@ -2726,42 +2672,50 @@ static int attach_recursive_mnt(struct mount *source_mnt,  	}  	if (moving) { -		if (beneath) -			dest_mp = smp; -		unhash_mnt(source_mnt); -		attach_mnt(source_mnt, top_mnt, dest_mp, beneath); +		umount_mnt(source_mnt);  		mnt_notify_add(source_mnt); -		touch_mnt_namespace(source_mnt->mnt_ns); +		/* if the mount is moved, it should no longer be expired +		 * automatically */ +		list_del_init(&source_mnt->mnt_expire);  	} else {  		if (source_mnt->mnt_ns) { -			LIST_HEAD(head); -  			/* move from anon - the caller will destroy */ +			emptied_ns = source_mnt->mnt_ns;  			for (p = source_mnt; p; p = next_mnt(p, source_mnt)) -				move_from_ns(p, &head); -			list_del_init(&head); +				move_from_ns(p);  		} -		if (beneath) -			mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); -		else -			mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); -		commit_tree(source_mnt);  	} +	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); +	/* +	 * Now the original copy is in the same state as the secondaries - +	 * its root attached to mountpoint, but not hashed and all mounts +	 * in it are either in our namespace or in no namespace at all. +	 * Add the original to the list of copies and deal with the +	 * rest of work for all of them uniformly. +	 */ +	hlist_add_head(&source_mnt->mnt_hash, &tree_list); +  	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {  		struct mount *q;  		hlist_del_init(&child->mnt_hash); -		q = __lookup_mnt(&child->mnt_parent->mnt, -				 child->mnt_mountpoint); -		if (q) -			mnt_change_mountpoint(child, smp, q);  		/* Notice when we are propagating across user namespaces */  		if (child->mnt_parent->mnt_ns->user_ns != user_ns)  			lock_mnt_tree(child); -		child->mnt.mnt_flags &= ~MNT_LOCKED; +		q = __lookup_mnt(&child->mnt_parent->mnt, +				 child->mnt_mountpoint); +		if (q) { +			struct mountpoint *mp = root.mp; +			struct mount *r = child; +			while (unlikely(r->overmount)) +				r = r->overmount; +			if (unlikely(shorter) && child != source_mnt) +				mp = shorter; +			mnt_change_mountpoint(r, mp, q); +		}  		commit_tree(child);  	} -	put_mountpoint(smp); +	unpin_mountpoint(&root);  	unlock_mount_hash();  	return 0; @@ -2778,7 +2732,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,  	ns->pending_mounts = 0;  	read_seqlock_excl(&mount_lock); -	put_mountpoint(smp); +	unpin_mountpoint(&root);  	read_sequnlock_excl(&mount_lock);  	return err; @@ -2818,12 +2772,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,   * Return: Either the target mountpoint on the top mount or the top   *         mount's mountpoint.   */ -static struct mountpoint *do_lock_mount(struct path *path, bool beneath) +static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath)  {  	struct vfsmount *mnt = path->mnt;  	struct dentry *dentry; -	struct mountpoint *mp = ERR_PTR(-ENOENT);  	struct path under = {}; +	int err = -ENOENT;  	for (;;) {  		struct mount *m = real_mount(mnt); @@ -2861,8 +2815,8 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)  			path->dentry = dget(mnt->mnt_root);  			continue;	// got overmounted  		} -		mp = get_mountpoint(dentry); -		if (IS_ERR(mp)) +		err = get_mountpoint(dentry, pinned); +		if (err)  			break;  		if (beneath) {  			/* @@ -2873,25 +2827,25 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)  			 */  			path_put(&under);  		} -		return mp; +		return 0;  	}  	namespace_unlock();  	inode_unlock(dentry->d_inode);  	if (beneath)  		path_put(&under); -	return mp; +	return err;  } -static inline struct mountpoint *lock_mount(struct path *path) +static inline int lock_mount(struct path *path, struct pinned_mountpoint *m)  { -	return do_lock_mount(path, false); +	return do_lock_mount(path, m, false);  } -static void unlock_mount(struct mountpoint *where) +static void unlock_mount(struct pinned_mountpoint *m)  { -	inode_unlock(where->m_dentry->d_inode); +	inode_unlock(m->mp->m_dentry->d_inode);  	read_seqlock_excl(&mount_lock); -	put_mountpoint(where); +	unpin_mountpoint(m);  	read_sequnlock_excl(&mount_lock);  	namespace_unlock();  } @@ -2905,7 +2859,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)  	      d_is_dir(mnt->mnt.mnt_root))  		return -ENOTDIR; -	return attach_recursive_mnt(mnt, p, mp, 0); +	return attach_recursive_mnt(mnt, p, mp);  }  /* @@ -2954,10 +2908,8 @@ static int do_change_type(struct path *path, int ms_flags)  			goto out_unlock;  	} -	lock_mount_hash();  	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))  		change_mnt_propagation(m, type); -	unlock_mount_hash();   out_unlock:  	namespace_unlock(); @@ -3031,26 +2983,21 @@ static inline bool may_copy_tree(struct path *path)  static struct mount *__do_loopback(struct path *old_path, int recurse)  { -	struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); +	struct mount *old = real_mount(old_path->mnt);  	if (IS_MNT_UNBINDABLE(old)) -		return mnt; +		return ERR_PTR(-EINVAL);  	if (!may_copy_tree(old_path)) -		return mnt; +		return ERR_PTR(-EINVAL);  	if (!recurse && __has_locked_children(old, old_path->dentry)) -		return mnt; +		return ERR_PTR(-EINVAL);  	if (recurse) -		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); +		return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);  	else -		mnt = clone_mnt(old, old_path->dentry, 0); - -	if (!IS_ERR(mnt)) -		mnt->mnt.mnt_flags &= ~MNT_LOCKED; - -	return mnt; +		return clone_mnt(old, old_path->dentry, 0);  }  /* @@ -3061,7 +3008,7 @@ static int do_loopback(struct path *path, const char *old_name,  {  	struct path old_path;  	struct mount *mnt = NULL, *parent; -	struct mountpoint *mp; +	struct pinned_mountpoint mp = {};  	int err;  	if (!old_name || !*old_name)  		return -EINVAL; @@ -3073,11 +3020,9 @@ static int do_loopback(struct path *path, const char *old_name,  	if (mnt_ns_loop(old_path.dentry))  		goto out; -	mp = lock_mount(path); -	if (IS_ERR(mp)) { -		err = PTR_ERR(mp); +	err = lock_mount(path, &mp); +	if (err)  		goto out; -	}  	parent = real_mount(path->mnt);  	if (!check_mnt(parent)) @@ -3089,14 +3034,14 @@ static int do_loopback(struct path *path, const char *old_name,  		goto out2;  	} -	err = graft_tree(mnt, parent, mp); +	err = graft_tree(mnt, parent, mp.mp);  	if (err) {  		lock_mount_hash();  		umount_tree(mnt, UMOUNT_SYNC);  		unlock_mount_hash();  	}  out2: -	unlock_mount(mp); +	unlock_mount(&mp);  out:  	path_put(&old_path);  	return err; @@ -3444,18 +3389,14 @@ static int do_set_group(struct path *from_path, struct path *to_path)  		goto out;  	if (IS_MNT_SLAVE(from)) { -		struct mount *m = from->mnt_master; - -		list_add(&to->mnt_slave, &from->mnt_slave); -		to->mnt_master = m; +		hlist_add_behind(&to->mnt_slave, &from->mnt_slave); +		to->mnt_master = from->mnt_master;  	}  	if (IS_MNT_SHARED(from)) {  		to->mnt_group_id = from->mnt_group_id;  		list_add(&to->mnt_share, &from->mnt_share); -		lock_mount_hash();  		set_mnt_shared(to); -		unlock_mount_hash();  	}  	err = 0; @@ -3492,6 +3433,17 @@ static inline bool path_overmounted(const struct path *path)  	return unlikely(!no_child);  } +/* + * Check if there is a possibly empty chain of descent from p1 to p2. + * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl). + */ +static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) +{ +	while (p2 != p1 && mnt_has_parent(p2)) +		p2 = p2->mnt_parent; +	return p2 == p1; +} +  /**   * can_move_mount_beneath - check that we can mount beneath the top mount   * @from: mount to mount beneath @@ -3543,9 +3495,8 @@ static int can_move_mount_beneath(const struct path *from,  	if (parent_mnt_to == current->nsproxy->mnt_ns->root)  		return -EINVAL; -	for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) -		if (p == mnt_to) -			return -EINVAL; +	if (mount_is_ancestor(mnt_to, mnt_from)) +		return -EINVAL;  	/*  	 * If the parent mount propagates to the child mount this would @@ -3630,27 +3581,20 @@ static int do_move_mount(struct path *old_path,  	struct mount *p;  	struct mount *old;  	struct mount *parent; -	struct mountpoint *mp, *old_mp; +	struct pinned_mountpoint mp;  	int err; -	bool attached, beneath = flags & MNT_TREE_BENEATH; +	bool beneath = flags & MNT_TREE_BENEATH; -	mp = do_lock_mount(new_path, beneath); -	if (IS_ERR(mp)) -		return PTR_ERR(mp); +	err = do_lock_mount(new_path, &mp, beneath); +	if (err) +		return err;  	old = real_mount(old_path->mnt);  	p = real_mount(new_path->mnt);  	parent = old->mnt_parent; -	attached = mnt_has_parent(old); -	if (attached) -		flags |= MNT_TREE_MOVE; -	old_mp = old->mnt_mp;  	ns = old->mnt_ns;  	err = -EINVAL; -	/* The thing moved must be mounted... */ -	if (!is_mounted(&old->mnt)) -		goto out;  	if (check_mnt(old)) {  		/* if the source is in our namespace... */ @@ -3660,13 +3604,14 @@ static int do_move_mount(struct path *old_path,  		/* ... and the target should be in our namespace */  		if (!check_mnt(p))  			goto out; +		/* parent of the source should not be shared */ +		if (IS_MNT_SHARED(parent)) +			goto out;  	} else {  		/*  		 * otherwise the source must be the root of some anon namespace. -		 * AV: check for mount being root of an anon namespace is worth -		 * an inlined predicate...  		 */ -		if (!is_anon_ns(ns) || mnt_has_parent(old)) +		if (!anon_ns_root(old))  			goto out;  		/*  		 * Bail out early if the target is within the same namespace - @@ -3689,20 +3634,14 @@ static int do_move_mount(struct path *old_path,  	if (d_is_dir(new_path->dentry) !=  	    d_is_dir(old_path->dentry))  		goto out; -	/* -	 * Don't move a mount residing in a shared parent. -	 */ -	if (attached && IS_MNT_SHARED(parent)) -		goto out;  	if (beneath) { -		err = can_move_mount_beneath(old_path, new_path, mp); +		err = can_move_mount_beneath(old_path, new_path, mp.mp);  		if (err)  			goto out;  		err = -EINVAL;  		p = p->mnt_parent; -		flags |= MNT_TREE_BENEATH;  	}  	/* @@ -3714,30 +3653,12 @@ static int do_move_mount(struct path *old_path,  	err = -ELOOP;  	if (!check_for_nsfs_mounts(old))  		goto out; -	for (; mnt_has_parent(p); p = p->mnt_parent) -		if (p == old) -			goto out; - -	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); -	if (err) +	if (mount_is_ancestor(old, p))  		goto out; -	/* if the mount is moved, it should no longer be expire -	 * automatically */ -	list_del_init(&old->mnt_expire); -	if (attached) -		put_mountpoint(old_mp); +	err = attach_recursive_mnt(old, p, mp.mp);  out: -	unlock_mount(mp); -	if (!err) { -		if (attached) { -			mntput_no_expire(parent); -		} else { -			/* Make sure we notice when we leak mounts. */ -			VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); -			free_mnt_ns(ns); -		} -	} +	unlock_mount(&mp);  	return err;  } @@ -3798,7 +3719,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,  			   unsigned int mnt_flags)  {  	struct vfsmount *mnt; -	struct mountpoint *mp; +	struct pinned_mountpoint mp = {};  	struct super_block *sb = fc->root->d_sb;  	int error; @@ -3819,13 +3740,12 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,  	mnt_warn_timestamp_expiry(mountpoint, mnt); -	mp = lock_mount(mountpoint); -	if (IS_ERR(mp)) { -		mntput(mnt); -		return PTR_ERR(mp); +	error = lock_mount(mountpoint, &mp); +	if (!error) { +		error = do_add_mount(real_mount(mnt), mp.mp, +				     mountpoint, mnt_flags); +		unlock_mount(&mp);  	} -	error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); -	unlock_mount(mp);  	if (error < 0)  		mntput(mnt);  	return error; @@ -3893,7 +3813,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,  int finish_automount(struct vfsmount *m, const struct path *path)  {  	struct dentry *dentry = path->dentry; -	struct mountpoint *mp; +	struct pinned_mountpoint mp = {};  	struct mount *mnt;  	int err; @@ -3925,14 +3845,13 @@ int finish_automount(struct vfsmount *m, const struct path *path)  		err = 0;  		goto discard_locked;  	} -	mp = get_mountpoint(dentry); -	if (IS_ERR(mp)) { -		err = PTR_ERR(mp); +	err = get_mountpoint(dentry, &mp); +	if (err)  		goto discard_locked; -	} -	err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); -	unlock_mount(mp); +	err = do_add_mount(mnt, mp.mp, path, +			   path->mnt->mnt_flags | MNT_SHRINKABLE); +	unlock_mount(&mp);  	if (unlikely(err))  		goto discard;  	return 0; @@ -3941,12 +3860,6 @@ discard_locked:  	namespace_unlock();  	inode_unlock(dentry->d_inode);  discard: -	/* remove m from any expiration list it may be on */ -	if (!list_empty(&mnt->mnt_expire)) { -		namespace_lock(); -		list_del_init(&mnt->mnt_expire); -		namespace_unlock(); -	}  	mntput(m);  	return err;  } @@ -3958,11 +3871,9 @@ discard:   */  void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)  { -	namespace_lock(); - +	read_seqlock_excl(&mount_lock);  	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - -	namespace_unlock(); +	read_sequnlock_excl(&mount_lock);  }  EXPORT_SYMBOL(mnt_set_expiry); @@ -4316,7 +4227,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,  	/* First pass: copy the tree topology */  	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;  	if (user_ns != ns->user_ns) -		copy_flags |= CL_SHARED_TO_SLAVE; +		copy_flags |= CL_SLAVE;  	new = copy_tree(old, old->mnt.mnt_root, copy_flags);  	if (IS_ERR(new)) {  		namespace_unlock(); @@ -4741,7 +4652,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,  {  	struct path new, old, root;  	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; -	struct mountpoint *old_mp, *root_mp; +	struct pinned_mountpoint old_mp = {};  	int error;  	if (!may_mount()) @@ -4762,9 +4673,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,  		goto out2;  	get_fs_root(current->fs, &root); -	old_mp = lock_mount(&old); -	error = PTR_ERR(old_mp); -	if (IS_ERR(old_mp)) +	error = lock_mount(&old, &old_mp); +	if (error)  		goto out3;  	error = -EINVAL; @@ -4791,11 +4701,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,  	if (!path_mounted(&root))  		goto out4; /* not a mountpoint */  	if (!mnt_has_parent(root_mnt)) -		goto out4; /* not attached */ +		goto out4; /* absolute root */  	if (!path_mounted(&new))  		goto out4; /* not a mountpoint */  	if (!mnt_has_parent(new_mnt)) -		goto out4; /* not attached */ +		goto out4; /* absolute root */  	/* make sure we can reach put_old from new_root */  	if (!is_path_reachable(old_mnt, old.dentry, &new))  		goto out4; @@ -4804,29 +4714,25 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,  		goto out4;  	lock_mount_hash();  	umount_mnt(new_mnt); -	root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */  	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {  		new_mnt->mnt.mnt_flags |= MNT_LOCKED;  		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;  	} -	/* mount old root on put_old */ -	attach_mnt(root_mnt, old_mnt, old_mp, false);  	/* mount new_root on / */ -	attach_mnt(new_mnt, root_parent, root_mp, false); -	mnt_add_count(root_parent, -1); +	attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp); +	umount_mnt(root_mnt); +	/* mount old root on put_old */ +	attach_mnt(root_mnt, old_mnt, old_mp.mp);  	touch_mnt_namespace(current->nsproxy->mnt_ns);  	/* A moved mount should not expire automatically */  	list_del_init(&new_mnt->mnt_expire); -	put_mountpoint(root_mp);  	unlock_mount_hash();  	mnt_notify_add(root_mnt);  	mnt_notify_add(new_mnt);  	chroot_fs_refs(&root, &new);  	error = 0;  out4: -	unlock_mount(old_mp); -	if (!error) -		mntput_no_expire(ex_parent); +	unlock_mount(&old_mp);  out3:  	path_put(&root);  out2: @@ -5028,22 +4934,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)  	err = -EINVAL;  	lock_mount_hash(); -	/* Ensure that this isn't anything purely vfs internal. */ -	if (!is_mounted(&mnt->mnt)) -		goto out; - -	/* -	 * If this is an attached mount make sure it's located in the callers -	 * mount namespace. If it's not don't let the caller interact with it. -	 * -	 * If this mount doesn't have a parent it's most often simply a -	 * detached mount with an anonymous mount namespace. IOW, something -	 * that's simply not attached yet. But there are apparently also users -	 * that do change mount properties on the rootfs itself. That obviously -	 * neither has a parent nor is it a detached mount so we cannot -	 * unconditionally check for detached mounts. -	 */ -	if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) +	if (!anon_ns_root(mnt) && !check_mnt(mnt))  		goto out;  	/* @@ -5290,16 +5181,12 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,  			kattr.kflags |= MOUNT_KATTR_RECURSE;  		ret = wants_mount_setattr(uattr, usize, &kattr); -		if (ret < 0) -			return ret; - -		if (ret) { +		if (ret > 0) {  			ret = do_mount_setattr(&file->f_path, &kattr); -			if (ret) -				return ret; -  			finish_mount_kattr(&kattr);  		} +		if (ret) +			return ret;  	}  	fd = get_unused_fd_flags(flags & O_CLOEXEC); @@ -5411,7 +5298,7 @@ static void statmount_mnt_basic(struct kstatmount *s)  	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;  	s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);  	s->sm.mnt_propagation = mnt_to_propagation_flags(m); -	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; +	s->sm.mnt_peer_group = m->mnt_group_id;  	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;  } @@ -6217,7 +6104,6 @@ static void __init init_mount_tree(void)  	root.mnt = mnt;  	root.dentry = mnt->mnt_root; -	mnt->mnt_flags |= MNT_LOCKED;  	set_fs_pwd(current->fs, &root);  	set_fs_root(current->fs, &root); @@ -6264,8 +6150,12 @@ void put_mnt_ns(struct mnt_namespace *ns)  {  	if (!refcount_dec_and_test(&ns->ns.count))  		return; -	drop_collected_mounts(&ns->root->mnt); -	free_mnt_ns(ns); +	namespace_lock(); +	emptied_ns = ns; +	lock_mount_hash(); +	umount_tree(ns->root, 0); +	unlock_mount_hash(); +	namespace_unlock();  }  struct vfsmount *kern_mount(struct file_system_type *type)  | 
