summaryrefslogtreecommitdiff
path: root/fs/namespace.c
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2025-02-05 14:14:33 +0100
committerChristian Brauner <brauner@kernel.org>2025-02-12 12:12:34 +0100
commit7a54947e727b6df840780a66c970395ed9734ebe (patch)
treee69aecb0a388e7256fabe04fe00bfabdd49d511b /fs/namespace.c
parent5a432de15fad05842c14b174bd6ca846ea9fbb97 (diff)
parent2462651ffa76b87f9c2e4403ef6e6b89b703fb2f (diff)
Merge patch series "fs: allow changing idmappings"
Christian Brauner <brauner@kernel.org> says: Currently, it isn't possible to change the idmapping of an idmapped mount. This is becoming an obstacle for various use-cases. /* idmapped home directories with systemd-homed */ On newer systems /home is can be an idmapped mount such that each file on disk is owned by 65536 and a subfolder exists for foreign id ranges such as containers. For example, a home directory might look like this (using an arbitrary folder as an example): user1@localhost:~/data/mount-idmapped$ ls -al /data/ total 16 drwxrwxrwx 1 65536 65536 36 Jan 27 12:15 . drwxrwxr-x 1 root root 184 Jan 27 12:06 .. -rw-r--r-- 1 65536 65536 0 Jan 27 12:07 aaa -rw-r--r-- 1 65536 65536 0 Jan 27 12:07 bbb -rw-r--r-- 1 65536 65536 0 Jan 27 12:07 cc drwxr-xr-x 1 2147352576 2147352576 0 Jan 27 19:06 containers When logging in home is mounted as an idmapped mount with the following idmappings: 65536:$(id -u):1 // uid mapping 65536:$(id -g):1 // gid mapping 2147352576:2147352576:65536 // uid mapping 2147352576:2147352576:65536 // gid mapping So for a user with uid/gid 1000 an idmapped /home would like like this: user1@localhost:~/data/mount-idmapped$ ls -aln /mnt/ total 16 drwxrwxrwx 1 1000 1000 36 Jan 27 12:15 . drwxrwxr-x 1 0 0 184 Jan 27 12:06 .. -rw-r--r-- 1 1000 1000 0 Jan 27 12:07 aaa -rw-r--r-- 1 1000 1000 0 Jan 27 12:07 bbb -rw-r--r-- 1 1000 1000 0 Jan 27 12:07 cc drwxr-xr-x 1 2147352576 2147352576 0 Jan 27 19:06 containers In other words, 65536 is mapped to the user's uid/gid and the range 2147352576 up to 2147352576 + 65536 is an identity mapping for containers. When a container is started a transient uid/gid range is allocated outside of both mappings of the idmapped mount. For example, the container might get the idmapping: $ cat /proc/1742611/uid_map 0 537985024 65536 This container will be allowed to write to disk within the allocated foreign id range 2147352576 to 2147352576 + 65536. To do this an idmapped mount must be created from an already idmapped mount such that: - The mappings for the user's uid/gid must be dropped, i.e., the following mappings are removed: 65536:$(id -u):1 // uid mapping 65536:$(id -g):1 // gid mapping - A mapping for the transient uid/gid range to the foreign uid/gid range is added: 2147352576:537985024:65536 In combination this will mean that the container will write to disk within the foreign id range 2147352576 to 2147352576 + 65536. /* nested containers */ When the outer container makes use of idmapped mounts it isn't posssible to create an idmapped mount for the inner container with a differen idmapping from the outer container's idmapped mount. There are other usecases and the two above just serve as an illustration of the problem. This patchset makes it possible to create a new idmapped mount from an already idmapped mount. It aims to adhere to current performance constraints and requirements: - Idmapped mounts aim to have near zero performance implications for path lookup. That is why no refernce counting, locking or any other mechanism can be required that would impact performance. This works be ensuring that a regular mount transitions to an idmapped mount once going from a static nop_mnt_idmap mapping to a non-static idmapping. - The idmapping of a mount change anymore for the lifetime of the mount afterwards. This not just avoids UAF issues it also avoids pitfalls such as generating non-matching uid/gid values. Changing idmappings could be solved by: - Idmappings could simply be reference counted (above the simple reference count when sharing them across multiple mounts). This would require pairing mnt_idmap_get() with mnt_idmap_put() which would end up being sprinkled everywhere into the VFS and some filesystems that access idmappings directly. It wouldn't just be quite ugly and introduce new complexity it would have a noticeable performance impact. - Idmappings could gain RCU protection. This would help the LOOKUP_RCU case and avoids taking reference counts under RCU. When not under LOOKUP_RCU reference counts need to be acquired on each idmapping. This would require pairing mnt_idmap_get() with mnt_idmap_put() which would end up being sprinkled everywhere into the VFS and some filesystems that access idmappings directly. This would have the same downsides as mentioned earlier. - The earlier solutions work by updating the mnt->mnt_idmap pointer with the new idmapping. Instead of this it would be possible to change the idmapping itself to avoid UAF issues. To do this a sequence counter would have to be added to struct mount. When retrieving the idmapping to generate uid/gid values the sequence counter would need to be sampled and the generation of the uid/gid would spin until the update of the idmap is finished. This has problems as well but the biggest issue will be that this can lead to inconsistent permission checking and inconsistent uid/gid pairs even more than this is already possible today. Specifically, during creation it could happen that: idmap = mnt_idmap(mnt); inode_permission(idmap, ...); may_create(idmap); // create file with uid/gid based on @idmap in between the permission checking and the generation of the uid/gid value the idmapping could change leading to the permission checking and uid/gid value that is actually used to create a file on disk being out of sync. Similarly if two values are generated like: idmap = mnt_idmap(mnt) vfsgid = make_vfsgid(idmap); // idmapping gets update concurrently vfsuid = make_vfsuid(idmap); @vfsgid and @vfsuid could be out of sync if the idmapping was changed in between. The generation of vfsgid/vfsuid could span a lot of codelines so to guard against this a sequence count would have to be passed around. The performance impact of this solutio are less clear but very likely not zero. - Using SRCU similar to fanotify that can sleep. I find that not just ugly but it would have memory consumption implications and is overall pretty ugly. /* solution */ So, to avoid all of these pitfalls creating an idmapped mount from an already idmapped mount will be done atomically, i.e., a new detached mount is created and a new set of mount properties applied to it without it ever having been exposed to userspace at all. This can be done in two ways. A new flag to open_tree() is added OPEN_TREE_CLEAR_IDMAP that clears the old idmapping and returns a mount that isn't idmapped. And then it is possible to set mount attributes on it again including creation of an idmapped mount. This has the consequence that a file descriptor must exist in userspace that doesn't have any idmapping applied and it will thus never work in unpriviledged scenarios. As a container would be able to remove the idmapping of the mount it has been given. That should be avoided. Instead, we add open_tree_attr() which works just like open_tree() but takes an optional struct mount_attr parameter. This is useful beyond idmappings as it fills a gap where a mount never exists in userspace without the necessary mount properties applied. This is particularly useful for mount options such as MOUNT_ATTR_{RDONLY,NOSUID,NODEV,NOEXEC}. To create a new idmapped mount the following works: // Create a first idmapped mount struct mount_attr attr = { .attr_set = MOUNT_ATTR_IDMAP .userns_fd = fd_userns }; fd_tree = open_tree(-EBADF, "/", OPEN_TREE_CLONE, &attr, sizeof(attr)); move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); // Create a second idmapped mount from the first idmapped mount attr.attr_set = MOUNT_ATTR_IDMAP; attr.userns_fd = fd_userns2; fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr)); // Create a second non-idmapped mount from the first idmapped mount: memset(&attr, 0, sizeof(attr)); attr.attr_clr = MOUNT_ATTR_IDMAP; fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr)); * patches from https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-0-c25feb0d2eb3@kernel.org: fs: allow changing idmappings fs: add kflags member to struct mount_kattr fs: add open_tree_attr() fs: add copy_mount_setattr() helper fs: add vfs_open_tree() helper Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-0-c25feb0d2eb3@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'fs/namespace.c')
-rw-r--r--fs/namespace.c252
1 files changed, 171 insertions, 81 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 967ee2a5bacf..01fb1074c4c9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -87,12 +87,17 @@ LIST_HEAD(notify_list); /* protected by namespace_sem */
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
+enum mount_kattr_flags_t {
+ MOUNT_KATTR_RECURSE = (1 << 0),
+ MOUNT_KATTR_IDMAP_REPLACE = (1 << 1),
+};
+
struct mount_kattr {
unsigned int attr_set;
unsigned int attr_clr;
unsigned int propagation;
unsigned int lookup_flags;
- bool recurse;
+ enum mount_kattr_flags_t kflags;
struct user_namespace *mnt_userns;
struct mnt_idmap *mnt_idmap;
};
@@ -3002,24 +3007,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
return file;
}
-SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
- struct file *file;
- struct path path;
+ int ret;
+ struct path path __free(path_put) = {};
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
bool detached = flags & OPEN_TREE_CLONE;
- int error;
- int fd;
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
OPEN_TREE_CLOEXEC))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
@@ -3029,27 +3032,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, fl
lookup_flags |= LOOKUP_EMPTY;
if (detached && !may_mount())
- return -EPERM;
+ return ERR_PTR(-EPERM);
+
+ ret = user_path_at(dfd, filename, lookup_flags, &path);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ if (detached)
+ return open_detached_copy(&path, flags & AT_RECURSIVE);
+
+ return dentry_open(&path, O_PATH, current_cred());
+}
+
+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+{
+ int fd;
+ struct file *file __free(fput) = NULL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
fd = get_unused_fd_flags(flags & O_CLOEXEC);
if (fd < 0)
return fd;
- error = user_path_at(dfd, filename, lookup_flags, &path);
- if (unlikely(error)) {
- file = ERR_PTR(error);
- } else {
- if (detached)
- file = open_detached_copy(&path, flags & AT_RECURSIVE);
- else
- file = dentry_open(&path, O_PATH, current_cred());
- path_put(&path);
- }
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- return PTR_ERR(file);
- }
- fd_install(fd, file);
+ fd_install(fd, no_free_ptr(file));
return fd;
}
@@ -4605,11 +4613,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
return -EINVAL;
/*
- * Once a mount has been idmapped we don't allow it to change its
- * mapping. It makes things simpler and callers can just create
- * another bind-mount they can idmap if they want to.
+ * We only allow an mount to change it's idmapping if it has
+ * never been accessible to userspace.
*/
- if (is_idmapped_mnt(m))
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m))
return -EPERM;
/* The underlying filesystem doesn't support idmapped mounts yet. */
@@ -4669,7 +4676,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
break;
}
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
return 0;
}
@@ -4699,18 +4706,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
+ struct mnt_idmap *old_idmap;
+
if (!kattr->mnt_idmap)
return;
- /*
- * Pairs with smp_load_acquire() in mnt_idmap().
- *
- * Since we only allow a mount to change the idmapping once and
- * verified this in can_idmap_mount() we know that the mount has
- * @nop_mnt_idmap attached to it. So there's no need to drop any
- * references.
- */
+ old_idmap = mnt_idmap(&mnt->mnt);
+
+ /* Pairs with smp_load_acquire() in mnt_idmap(). */
smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
+ mnt_idmap_put(old_idmap);
}
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
@@ -4730,7 +4735,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
break;
}
touch_mnt_namespace(mnt->mnt_ns);
@@ -4760,7 +4765,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
*/
namespace_lock();
if (kattr->propagation == MS_SHARED) {
- err = invent_group_ids(mnt, kattr->recurse);
+ err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
if (err) {
namespace_unlock();
return err;
@@ -4811,7 +4816,7 @@ out:
}
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
struct ns_common *ns;
struct user_namespace *mnt_userns;
@@ -4819,13 +4824,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
return 0;
- /*
- * We currently do not support clearing an idmapped mount. If this ever
- * is a use-case we can revisit this but for now let's keep it simple
- * and not allow it.
- */
- if (attr->attr_clr & MOUNT_ATTR_IDMAP)
- return -EINVAL;
+ if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
+ /*
+ * We can only remove an idmapping if it's never been
+ * exposed to userspace.
+ */
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
+ return -EINVAL;
+
+ /*
+ * Removal of idmappings is equivalent to setting
+ * nop_mnt_idmap.
+ */
+ if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
+ kattr->mnt_idmap = &nop_mnt_idmap;
+ return 0;
+ }
+ }
if (attr->userns_fd > INT_MAX)
return -EINVAL;
@@ -4862,22 +4877,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
- unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-
- if (flags & AT_NO_AUTOMOUNT)
- lookup_flags &= ~LOOKUP_AUTOMOUNT;
- if (flags & AT_SYMLINK_NOFOLLOW)
- lookup_flags &= ~LOOKUP_FOLLOW;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
- *kattr = (struct mount_kattr) {
- .lookup_flags = lookup_flags,
- .recurse = !!(flags & AT_RECURSIVE),
- };
-
if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
return -EINVAL;
if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
@@ -4925,35 +4926,28 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
return -EINVAL;
}
- return build_mount_idmapped(attr, usize, kattr, flags);
+ return build_mount_idmapped(attr, usize, kattr);
}
static void finish_mount_kattr(struct mount_kattr *kattr)
{
- put_user_ns(kattr->mnt_userns);
- kattr->mnt_userns = NULL;
+ if (kattr->mnt_userns) {
+ put_user_ns(kattr->mnt_userns);
+ kattr->mnt_userns = NULL;
+ }
if (kattr->mnt_idmap)
mnt_idmap_put(kattr->mnt_idmap);
}
-SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
- unsigned int, flags, struct mount_attr __user *, uattr,
- size_t, usize)
+static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,
+ struct mount_kattr *kattr)
{
- int err;
- struct path target;
+ int ret;
struct mount_attr attr;
- struct mount_kattr kattr;
BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
- if (flags & ~(AT_EMPTY_PATH |
- AT_RECURSIVE |
- AT_SYMLINK_NOFOLLOW |
- AT_NO_AUTOMOUNT))
- return -EINVAL;
-
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
@@ -4962,9 +4956,9 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
if (!may_mount())
return -EPERM;
- err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
- if (err)
- return err;
+ ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
+ if (ret)
+ return ret;
/* Don't bother walking through the mounts if this is a nop. */
if (attr.attr_set == 0 &&
@@ -4972,7 +4966,39 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
attr.propagation == 0)
return 0;
- err = build_mount_kattr(&attr, usize, &kattr, flags);
+ return build_mount_kattr(&attr, usize, kattr);
+}
+
+SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
+ unsigned int, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ int err;
+ struct path target;
+ struct mount_kattr kattr;
+ unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+
+ if (flags & ~(AT_EMPTY_PATH |
+ AT_RECURSIVE |
+ AT_SYMLINK_NOFOLLOW |
+ AT_NO_AUTOMOUNT))
+ return -EINVAL;
+
+ if (flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ kattr = (struct mount_kattr) {
+ .lookup_flags = lookup_flags,
+ };
+
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ err = copy_mount_setattr(uattr, usize, &kattr);
if (err)
return err;
@@ -4985,6 +5011,47 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
}
+SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
+ unsigned, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ struct file __free(fput) *file = NULL;
+ int fd;
+
+ if (!uattr && usize)
+ return -EINVAL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (uattr) {
+ int ret;
+ struct mount_kattr kattr = {};
+
+ kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ ret = copy_mount_setattr(uattr, usize, &kattr);
+ if (ret)
+ return ret;
+
+ ret = do_mount_setattr(&file->f_path, &kattr);
+ if (ret)
+ return ret;
+
+ finish_mount_kattr(&kattr);
+ }
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ fd_install(fd, no_free_ptr(file));
+ return fd;
+}
+
int show_path(struct seq_file *m, struct dentry *root)
{
if (root->d_sb->s_op->show_path)
@@ -5459,6 +5526,21 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
return 0;
}
+/* This must be updated whenever a new flag is added */
+#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \
+ STATMOUNT_MNT_BASIC | \
+ STATMOUNT_PROPAGATE_FROM | \
+ STATMOUNT_MNT_ROOT | \
+ STATMOUNT_MNT_POINT | \
+ STATMOUNT_FS_TYPE | \
+ STATMOUNT_MNT_NS_ID | \
+ STATMOUNT_MNT_OPTS | \
+ STATMOUNT_FS_SUBTYPE | \
+ STATMOUNT_SB_SOURCE | \
+ STATMOUNT_OPT_ARRAY | \
+ STATMOUNT_OPT_SEC_ARRAY | \
+ STATMOUNT_SUPPORTED_MASK)
+
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
@@ -5535,9 +5617,17 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);
+ if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
+ s->sm.mask |= STATMOUNT_SUPPORTED_MASK;
+ s->sm.supported_mask = STATMOUNT_SUPPORTED;
+ }
+
if (err)
return err;
+ /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */
+ WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
+
return 0;
}