fs: allow detached mounts in clone_private_mount()

In container workloads idmapped mounts are often used as layers for overlayfs. Recently I added the ability to specify layers in overlayfs as file descriptors instead of path names. It should be possible to simply use the detached mounts directly when specifying layers instead of having to attach them beforehand. They are discarded after overlayfs is mounted anyway so it's pointless system calls for userspace and pointless locking for the kernel. This just recently come up again in [1]. So enable clone_private_mount() to use detached mounts directly. Following conditions must be met: - Provided path must be the root of a detached mount tree. - Provided path may not create mount namespace loops. - Provided path must be mounted. It would be possible to be stricter and require that the caller must have CAP_SYS_ADMIN in the owning user namespace of the anonymous mount namespace but since this restriction isn't enforced for move_mount() there's no point in enforcing it for clone_private_mount(). This contains a folded fix for: Reported-by: syzbot+62dfea789a2cedac1298@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=62dfea789a2cedac1298 provided by Lizhi Xu <lizhi.xu@windriver.com> in [2]. Link: https://lore.kernel.org/r/20250207071331.550952-1-lizhi.xu@windriver.com [2] Link: https://lore.kernel.org/r/fd8f6574-f737-4743-b220-79c815ee1554@mbaynton.com [1] Link: https://lore.kernel.org/r/20250123-avancieren-erfreuen-3d61f6588fdd@brauner Tested-by: Mike Baynton <mike@mbaynton.com> Signed-off-by: Christian Brauner <brauner@kernel.org>
author: Christian Brauner <brauner@kernel.org> 2025-01-23 20:19:48 +0100
committer: Christian Brauner <brauner@kernel.org> 2025-02-12 12:12:24 +0100
commit: db04662e2f4fced10b93032c65ff03caaae21053 (patch)
tree: b53c58b129caebd1c4c0212b2648c00c0cd609eb /fs/namespace.c
parent: 29349a3d6da3be3210a2d0be3a17433dd10f9a40 (diff)
1 files changed, 43 insertions, 35 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 05f6680e2251..d470a369d42b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2369,6 +2369,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
 	return false;
 }
 
+/*
+ * Check that there aren't references to earlier/same mount namespaces in the
+ * specified subtree.  Such references can act as pins for mount namespaces
+ * that aren't checked by the mount-cycle checking code, thereby allowing
+ * cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
+{
+	struct mount *p;
+	bool ret = false;
+
+	lock_mount_hash();
+	for (p = subtree; p; p = next_mnt(p, subtree))
+		if (mnt_ns_loop(p->mnt.mnt_root))
+			goto out;
+
+	ret = true;
+out:
+	unlock_mount_hash();
+	return ret;
+}
+
 /**
  * clone_private_mount - create a private clone of a path
  * @path: path to clone
@@ -2377,6 +2399,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
  * will not be attached anywhere in the namespace and will be private (i.e.
  * changes to the originating mount won't be propagated into this).
  *
+ * This assumes caller has called or done the equivalent of may_mount().
+ *
  * Release with mntput().
  */
 struct vfsmount *clone_private_mount(const struct path *path)
@@ -2384,30 +2408,36 @@ struct vfsmount *clone_private_mount(const struct path *path)
 	struct mount *old_mnt = real_mount(path->mnt);
 	struct mount *new_mnt;
 
-	down_read(&namespace_sem);
+	scoped_guard(rwsem_read, &namespace_sem)
 	if (IS_MNT_UNBINDABLE(old_mnt))
-		goto invalid;
+		return ERR_PTR(-EINVAL);
+
+	if (mnt_has_parent(old_mnt)) {
+		if (!check_mnt(old_mnt))
+			return ERR_PTR(-EINVAL);
+	} else {
+		if (!is_mounted(&old_mnt->mnt))
+			return ERR_PTR(-EINVAL);
 
-	if (!check_mnt(old_mnt))
-		goto invalid;
+		/* Make sure this isn't something purely kernel internal. */
+		if (!is_anon_ns(old_mnt->mnt_ns))
+			return ERR_PTR(-EINVAL);
+
+		/* Make sure we don't create mount namespace loops. */
+		if (!check_for_nsfs_mounts(old_mnt))
+			return ERR_PTR(-EINVAL);
+	}
 
 	if (has_locked_children(old_mnt, path->dentry))
-		goto invalid;
+		return ERR_PTR(-EINVAL);
 
 	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
-	up_read(&namespace_sem);
-
 	if (IS_ERR(new_mnt))
-		return ERR_CAST(new_mnt);
+		return ERR_PTR(-EINVAL);
 
 	/* Longterm mount to be removed by kern_unmount*() */
 	new_mnt->mnt_ns = MNT_NS_INTERNAL;
-
 	return &new_mnt->mnt;
-
-invalid:
-	up_read(&namespace_sem);
-	return ERR_PTR(-EINVAL);
 }
 EXPORT_SYMBOL_GPL(clone_private_mount);
 
@@ -3206,28 +3236,6 @@ static inline int tree_contains_unbindable(struct mount *mnt)
 	return 0;
 }
 
-/*
- * Check that there aren't references to earlier/same mount namespaces in the
- * specified subtree.  Such references can act as pins for mount namespaces
- * that aren't checked by the mount-cycle checking code, thereby allowing
- * cycles to be made.
- */
-static bool check_for_nsfs_mounts(struct mount *subtree)
-{
-	struct mount *p;
-	bool ret = false;
-
-	lock_mount_hash();
-	for (p = subtree; p; p = next_mnt(p, subtree))
-		if (mnt_ns_loop(p->mnt.mnt_root))
-			goto out;
-
-	ret = true;
-out:
-	unlock_mount_hash();
-	return ret;
-}
-
 static int do_set_group(struct path *from_path, struct path *to_path)
 {
 	struct mount *from, *to;
author	Christian Brauner <brauner@kernel.org>	2025-01-23 20:19:48 +0100
committer	Christian Brauner <brauner@kernel.org>	2025-02-12 12:12:24 +0100
commit	db04662e2f4fced10b93032c65ff03caaae21053 (patch)
tree	b53c58b129caebd1c4c0212b2648c00c0cd609eb /fs/namespace.c
parent	29349a3d6da3be3210a2d0be3a17433dd10f9a40 (diff)