summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-09-29 11:20:29 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-09-29 11:20:29 -0700
commit18b19abc3709b109676ffd1f48dcd332c2e477d4 (patch)
treed0d51d0a3d6f99e6082b42cf6d1ca710a46b6b49
parent5484a4ea7a1f208b886b58dd55cc55f418930f8a (diff)
parent6e65f4e8fc5b02f7a60ebb5b1b83772df0b86663 (diff)
Merge tag 'namespace-6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull namespace updates from Christian Brauner: "This contains a larger set of changes around the generic namespace infrastructure of the kernel. Each specific namespace type (net, cgroup, mnt, ...) embedds a struct ns_common which carries the reference count of the namespace and so on. We open-coded and cargo-culted so many quirks for each namespace type that it just wasn't scalable anymore. So given there's a bunch of new changes coming in that area I've started cleaning all of this up. The core change is to make it possible to correctly initialize every namespace uniformly and derive the correct initialization settings from the type of the namespace such as namespace operations, namespace type and so on. This leaves the new ns_common_init() function with a single parameter which is the specific namespace type which derives the correct parameters statically. This also means the compiler will yell as soon as someone does something remotely fishy. The ns_common_init() addition also allows us to remove ns_alloc_inum() and drops any special-casing of the initial network namespace in the network namespace initialization code that Linus complained about. Another part is reworking the reference counting. The reference counting was open-coded and copy-pasted for each namespace type even though they all followed the same rules. This also removes all open accesses to the reference count and makes it private and only uses a very small set of dedicated helpers to manipulate them just like we do for e.g., files. In addition this generalizes the mount namespace iteration infrastructure introduced a few cycles ago. As reminder, the vfs makes it possible to iterate sequentially and bidirectionally through all mount namespaces on the system or all mount namespaces that the caller holds privilege over. This allow userspace to iterate over all mounts in all mount namespaces using the listmount() and statmount() system call. Each mount namespace has a unique identifier for the lifetime of the systems that is exposed to userspace. The network namespace also has a unique identifier working exactly the same way. This extends the concept to all other namespace types. The new nstree type makes it possible to lookup namespaces purely by their identifier and to walk the namespace list sequentially and bidirectionally for all namespace types, allowing userspace to iterate through all namespaces. Looking up namespaces in the namespace tree works completely locklessly. This also means we can move the mount namespace onto the generic infrastructure and remove a bunch of code and members from struct mnt_namespace itself. There's a bunch of stuff coming on top of this in the future but for now this uses the generic namespace tree to extend a concept introduced first for pidfs a few cycles ago. For a while now we have supported pidfs file handles for pidfds. This has proven to be very useful. This extends the concept to cover namespaces as well. It is possible to encode and decode namespace file handles using the common name_to_handle_at() and open_by_handle_at() apis. As with pidfs file handles, namespace file handles are exhaustive, meaning it is not required to actually hold a reference to nsfs in able to decode aka open_by_handle_at() a namespace file handle. Instead the FD_NSFS_ROOT constant can be passed which will let the kernel grab a reference to the root of nsfs internally and thus decode the file handle. Namespaces file descriptors can already be derived from pidfds which means they aren't subject to overmount protection bugs. IOW, it's irrelevant if the caller would not have access to an appropriate /proc/<pid>/ns/ directory as they could always just derive the namespace based on a pidfd already. It has the same advantage as pidfds. It's possible to reliably and for the lifetime of the system refer to a namespace without pinning any resources and to compare them trivially. Permission checking is kept simple. If the caller is located in the namespace the file handle refers to they are able to open it otherwise they must hold privilege over the owning namespace of the relevant namespace. The namespace file handle layout is exposed as uapi and has a stable and extensible format. For now it simply contains the namespace identifier, the namespace type, and the inode number. The stable format means that userspace may construct its own namespace file handles without going through name_to_handle_at() as they are already allowed for pidfs and cgroup file handles" * tag 'namespace-6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (65 commits) ns: drop assert ns: move ns type into struct ns_common nstree: make struct ns_tree private ns: add ns_debug() ns: simplify ns_common_init() further cgroup: add missing ns_common include ns: use inode initializer for initial namespaces selftests/namespaces: verify initial namespace inode numbers ns: rename to __ns_ref nsfs: port to ns_ref_*() helpers net: port to ns_ref_*() helpers uts: port to ns_ref_*() helpers ipv4: use check_net() net: use check_net() net-sysfs: use check_net() user: port to ns_ref_*() helpers time: port to ns_ref_*() helpers pid: port to ns_ref_*() helpers ipc: port to ns_ref_*() helpers cgroup: port to ns_ref_*() helpers ...
-rw-r--r--block/blk-integrity.c8
-rw-r--r--fs/fhandle.c6
-rw-r--r--fs/internal.h1
-rw-r--r--fs/mount.h12
-rw-r--r--fs/namespace.c196
-rw-r--r--fs/nsfs.c211
-rw-r--r--fs/pidfs.c2
-rw-r--r--fs/proc/root.c2
-rw-r--r--include/linux/cgroup.h47
-rw-r--r--include/linux/cgroup_namespace.h58
-rw-r--r--include/linux/exportfs.h6
-rw-r--r--include/linux/fs.h14
-rw-r--r--include/linux/ipc_namespace.h9
-rw-r--r--include/linux/mnt_namespace.h2
-rw-r--r--include/linux/ns_common.h139
-rw-r--r--include/linux/nsfs.h40
-rw-r--r--include/linux/nsproxy.h11
-rw-r--r--include/linux/nstree.h78
-rw-r--r--include/linux/pid_namespace.h7
-rw-r--r--include/linux/proc_ns.h22
-rw-r--r--include/linux/time_namespace.h13
-rw-r--r--include/linux/user_namespace.h9
-rw-r--r--include/linux/uts_namespace.h65
-rw-r--r--include/linux/utsname.h53
-rw-r--r--include/net/net_namespace.h13
-rw-r--r--include/uapi/linux/fcntl.h1
-rw-r--r--include/uapi/linux/nsfs.h18
-rw-r--r--init/main.c2
-rw-r--r--init/version-timestamp.c5
-rw-r--r--ipc/msgutil.c6
-rw-r--r--ipc/namespace.c19
-rw-r--r--ipc/shm.c2
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/cgroup/cgroup.c7
-rw-r--r--kernel/cgroup/namespace.c27
-rw-r--r--kernel/nscommon.c77
-rw-r--r--kernel/nsproxy.c4
-rw-r--r--kernel/nstree.c247
-rw-r--r--kernel/pid.c5
-rw-r--r--kernel/pid_namespace.c23
-rw-r--r--kernel/time/namespace.c32
-rw-r--r--kernel/user.c5
-rw-r--r--kernel/user_namespace.c24
-rw-r--r--kernel/utsname.c31
-rw-r--r--net/core/net-sysfs.c6
-rw-r--r--net/core/net_namespace.c58
-rw-r--r--net/ipv4/inet_timewait_sock.c4
-rw-r--r--net/ipv4/tcp_metrics.c2
-rw-r--r--tools/include/uapi/linux/nsfs.h17
-rw-r--r--tools/testing/selftests/namespaces/.gitignore3
-rw-r--r--tools/testing/selftests/namespaces/Makefile7
-rw-r--r--tools/testing/selftests/namespaces/config7
-rw-r--r--tools/testing/selftests/namespaces/file_handle_test.c1429
-rw-r--r--tools/testing/selftests/namespaces/init_ino_test.c61
-rw-r--r--tools/testing/selftests/namespaces/nsid_test.c986
55 files changed, 3678 insertions, 463 deletions
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 056b8948369d..ce08ad4565e2 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -58,16 +58,14 @@ new_segment:
int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
struct logical_block_metadata_cap __user *argp)
{
- struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk);
+ struct blk_integrity *bi;
struct logical_block_metadata_cap meta_cap = {};
size_t usize = _IOC_SIZE(cmd);
- if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) ||
- _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) ||
- _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) ||
- _IOC_SIZE(cmd) < LBMD_SIZE_VER0)
+ if (!extensible_ioctl_valid(cmd, FS_IOC_GETLBMD_CAP, LBMD_SIZE_VER0))
return -ENOIOCTLCMD;
+ bi = blk_get_integrity(bdev->bd_disk);
if (!bi)
goto out;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index a907ddfac4d5..052f9c9368fb 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -11,6 +11,7 @@
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
+#include <linux/nsfs.h>
#include "internal.h"
#include "mount.h"
@@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root)
return 0;
}
+ if (fd == FD_NSFS_ROOT) {
+ nsfs_get_root(root);
+ return 0;
+ }
+
return -EBADF;
}
diff --git a/fs/internal.h b/fs/internal.h
index 38e8aab27bbd..a33d18ee5b74 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
void pidfs_get_root(struct path *path);
+void nsfs_get_root(struct path *path);
diff --git a/fs/mount.h b/fs/mount.h
index 97737051a8b9..79c85639a7ba 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -17,11 +17,7 @@ struct mnt_namespace {
};
struct user_namespace *user_ns;
struct ucounts *ucounts;
- u64 seq; /* Sequence number to prevent loops */
- union {
- wait_queue_head_t poll;
- struct rcu_head mnt_ns_rcu;
- };
+ wait_queue_head_t poll;
u64 seq_origin; /* Sequence number of origin mount namespace */
u64 event;
#ifdef CONFIG_FSNOTIFY
@@ -30,8 +26,6 @@ struct mnt_namespace {
#endif
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
- struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
- struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;
@@ -149,7 +143,7 @@ static inline void detach_mounts(struct dentry *dentry)
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
}
extern seqlock_t mount_lock;
@@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry)
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
- return ns->seq == 0;
+ return ns->ns.ns_id == 0;
}
static inline bool anon_ns_root(const struct mount *m)
diff --git a/fs/namespace.c b/fs/namespace.c
index 18bd27d8c9ab..dc01b14c58cd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -33,6 +33,7 @@
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/pidfs.h>
+#include <linux/nstree.h>
#include "pnode.h"
#include "internal.h"
@@ -89,13 +90,10 @@ static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
-static DEFINE_SEQLOCK(mnt_ns_tree_lock);
#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
-static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
-static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
enum mount_kattr_flags_t {
MOUNT_KATTR_RECURSE = (1 << 0),
@@ -128,53 +126,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
+ struct ns_common *ns;
+
if (!node)
return NULL;
- return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
-}
-
-static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
-{
- struct mnt_namespace *ns_a = node_to_mnt_ns(a);
- struct mnt_namespace *ns_b = node_to_mnt_ns(b);
- u64 seq_a = ns_a->seq;
- u64 seq_b = ns_b->seq;
-
- if (seq_a < seq_b)
- return -1;
- if (seq_a > seq_b)
- return 1;
- return 0;
-}
-
-static inline void mnt_ns_tree_write_lock(void)
-{
- write_seqlock(&mnt_ns_tree_lock);
-}
-
-static inline void mnt_ns_tree_write_unlock(void)
-{
- write_sequnlock(&mnt_ns_tree_lock);
-}
-
-static void mnt_ns_tree_add(struct mnt_namespace *ns)
-{
- struct rb_node *node, *prev;
-
- mnt_ns_tree_write_lock();
- node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
- /*
- * If there's no previous entry simply add it after the
- * head and if there is add it after the previous entry.
- */
- prev = rb_prev(&ns->mnt_ns_tree_node);
- if (!prev)
- list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
- else
- list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
- mnt_ns_tree_write_unlock();
-
- WARN_ON_ONCE(node);
+ ns = rb_entry(node, struct ns_common, ns_tree_node);
+ return container_of(ns, struct mnt_namespace, ns);
}
static void mnt_ns_release(struct mnt_namespace *ns)
@@ -190,32 +147,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
- mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+ mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu));
}
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
/* remove from global mount namespace list */
- if (!is_anon_ns(ns)) {
- mnt_ns_tree_write_lock();
- rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
- list_bidir_del_rcu(&ns->mnt_ns_list);
- mnt_ns_tree_write_unlock();
- }
+ if (ns_tree_active(ns))
+ ns_tree_remove(ns);
- call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
-}
-
-static int mnt_ns_find(const void *key, const struct rb_node *node)
-{
- const u64 mnt_ns_id = *(u64 *)key;
- const struct mnt_namespace *ns = node_to_mnt_ns(node);
-
- if (mnt_ns_id < ns->seq)
- return -1;
- if (mnt_ns_id > ns->seq)
- return 1;
- return 0;
+ call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu);
}
/*
@@ -234,28 +175,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node)
*/
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
- struct mnt_namespace *ns;
- struct rb_node *node;
- unsigned int seq;
+ struct mnt_namespace *mnt_ns;
+ struct ns_common *ns;
guard(rcu)();
- do {
- seq = read_seqbegin(&mnt_ns_tree_lock);
- node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
- if (node)
- break;
- } while (read_seqretry(&mnt_ns_tree_lock, seq));
-
- if (!node)
+ ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS);
+ if (!ns)
return NULL;
/*
* The last reference count is put with RCU delay so we can
* unconditonally acquire a reference here.
*/
- ns = node_to_mnt_ns(node);
- refcount_inc(&ns->passive);
- return ns;
+ mnt_ns = container_of(ns, struct mnt_namespace, ns);
+ refcount_inc(&mnt_ns->passive);
+ return mnt_ns;
}
static inline void lock_mount_hash(void)
@@ -1026,7 +960,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt)
return false;
seq = mnt->mnt_ns->seq_origin;
- return !seq || (seq == current->nsproxy->mnt_ns->seq);
+ return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id);
}
/*
@@ -2161,19 +2095,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
+ struct ns_common *ns;
+
guard(rcu)();
for (;;) {
- struct list_head *list;
-
- if (previous)
- list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
- else
- list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
- if (list_is_head(list, &mnt_ns_list))
- return ERR_PTR(-ENOENT);
+ ns = ns_tree_adjoined_rcu(mntns, previous);
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
- mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
+ mntns = to_mnt_ns(ns);
/*
* The last passive reference count is put with RCU
@@ -2188,7 +2119,7 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr
* the mount namespace and it might already be on its
* deathbed.
*/
- if (!refcount_inc_not_zero(&mntns->ns.count))
+ if (!ns_ref_get(mntns))
continue;
return mntns;
@@ -2213,7 +2144,7 @@ static bool mnt_ns_loop(struct dentry *dentry)
if (!mnt_ns)
return false;
- return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
+ return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id;
}
struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
@@ -3089,7 +3020,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
if (is_anon_ns(src_mnt_ns))
ns->seq_origin = src_mnt_ns->seq_origin;
else
- ns->seq_origin = src_mnt_ns->seq;
+ ns->seq_origin = src_mnt_ns->ns.ns_id;
}
mnt = __do_loopback(path, recursive);
@@ -4162,20 +4093,11 @@ static void dec_mnt_namespaces(struct ucounts *ucounts)
static void free_mnt_ns(struct mnt_namespace *ns)
{
if (!is_anon_ns(ns))
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
dec_mnt_namespaces(ns->ucounts);
mnt_ns_tree_remove(ns);
}
-/*
- * Assign a sequence number so we can detect when we attempt to bind
- * mount a reference to an older mount namespace into the current
- * mount namespace, preventing reference counting loops. A 64bit
- * number incrementing at 10Ghz will take 12,427 years to wrap which
- * is effectively never, so we can ignore the possibility.
- */
-static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
-
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
struct mnt_namespace *new_ns;
@@ -4191,22 +4113,20 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
}
- if (!anon) {
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- dec_mnt_namespaces(ucounts);
- return ERR_PTR(ret);
- }
+
+ if (anon)
+ ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO);
+ else
+ ret = ns_common_init(new_ns);
+ if (ret) {
+ kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
+ return ERR_PTR(ret);
}
- new_ns->ns.ops = &mntns_operations;
if (!anon)
- new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
- refcount_set(&new_ns->ns.count, 1);
+ ns_tree_gen_id(&new_ns->ns);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
- INIT_LIST_HEAD(&new_ns->mnt_ns_list);
- RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
@@ -4245,7 +4165,7 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
- ns_free_inum(&new_ns->ns);
+ ns_common_free(ns);
dec_mnt_namespaces(new_ns->ucounts);
mnt_ns_release(new_ns);
return ERR_CAST(new);
@@ -4292,7 +4212,7 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
if (pwdmnt)
mntput(pwdmnt);
- mnt_ns_tree_add(new_ns);
+ ns_tree_add_raw(new_ns);
return new_ns;
}
@@ -5018,7 +4938,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
return -EINVAL;
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWUSER)
+ if (ns->ns_type != CLONE_NEWUSER)
return -EINVAL;
/*
@@ -5411,7 +5331,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
s->sm.mask |= STATMOUNT_MNT_NS_ID;
- s->sm.mnt_ns_id = ns->seq;
+ s->sm.mnt_ns_id = ns->ns.ns_id;
}
static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
@@ -5918,7 +5838,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
return ERR_PTR(-EINVAL);
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return ERR_PTR(-EINVAL);
mnt_ns = to_mnt_ns(ns);
@@ -6131,28 +6051,33 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
return ret;
}
+struct mnt_namespace init_mnt_ns = {
+ .ns.inum = ns_init_inum(&init_mnt_ns),
+ .ns.ops = &mntns_operations,
+ .user_ns = &init_user_ns,
+ .ns.__ns_ref = REFCOUNT_INIT(1),
+ .ns.ns_type = ns_common_type(&init_mnt_ns),
+ .passive = REFCOUNT_INIT(1),
+ .mounts = RB_ROOT,
+ .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
+};
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct mount *m;
- struct mnt_namespace *ns;
struct path root;
mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
if (IS_ERR(mnt))
panic("Can't create rootfs");
- ns = alloc_mnt_ns(&init_user_ns, true);
- if (IS_ERR(ns))
- panic("Can't allocate initial namespace");
- ns->seq = atomic64_inc_return(&mnt_ns_seq);
- ns->ns.inum = PROC_MNT_INIT_INO;
m = real_mount(mnt);
- ns->root = m;
- ns->nr_mounts = 1;
- mnt_add_to_ns(ns, m);
- init_task.nsproxy->mnt_ns = ns;
- get_mnt_ns(ns);
+ init_mnt_ns.root = m;
+ init_mnt_ns.nr_mounts = 1;
+ mnt_add_to_ns(&init_mnt_ns, m);
+ init_task.nsproxy->mnt_ns = &init_mnt_ns;
+ get_mnt_ns(&init_mnt_ns);
root.mnt = mnt;
root.dentry = mnt->mnt_root;
@@ -6160,7 +6085,7 @@ static void __init init_mount_tree(void)
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
- mnt_ns_tree_add(ns);
+ ns_tree_add(&init_mnt_ns);
}
void __init mnt_init(void)
@@ -6200,7 +6125,7 @@ void __init mnt_init(void)
void put_mnt_ns(struct mnt_namespace *ns)
{
- if (!refcount_dec_and_test(&ns->ns.count))
+ if (!ns_ref_put(ns))
return;
namespace_lock();
emptied_ns = ns;
@@ -6449,7 +6374,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns)
const struct proc_ns_operations mntns_operations = {
.name = "mnt",
- .type = CLONE_NEWNS,
.get = mntns_get,
.put = mntns_put,
.install = mntns_install,
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 59aa801347a7..e7fd8a790aaa 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -13,12 +13,26 @@
#include <linux/nsfs.h>
#include <linux/uaccess.h>
#include <linux/mnt_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
+#include <linux/utsname.h>
+#include <linux/exportfs.h>
+#include <linux/nstree.h>
+#include <net/net_namespace.h>
#include "mount.h"
#include "internal.h"
static struct vfsmount *nsfs_mnt;
+static struct path nsfs_root_path = {};
+
+void nsfs_get_root(struct path *path)
+{
+ *path = nsfs_root_path;
+ path_get(path);
+}
+
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg);
static const struct file_operations ns_file_operations = {
@@ -139,7 +153,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
* the size value will be set to the size the kernel knows about.
*/
kinfo->size = min(usize, sizeof(*kinfo));
- kinfo->mnt_ns_id = mnt_ns->seq;
+ kinfo->mnt_ns_id = mnt_ns->ns.ns_id;
kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts);
/* Subtract the root mount of the mount namespace. */
if (kinfo->nr_mounts)
@@ -163,15 +177,18 @@ static bool nsfs_ioctl_valid(unsigned int cmd)
case NS_GET_TGID_FROM_PIDNS:
case NS_GET_PID_IN_PIDNS:
case NS_GET_TGID_IN_PIDNS:
- return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ case NS_GET_ID:
+ return true;
}
/* Extensible ioctls require some extra handling. */
switch (_IOC_NR(cmd)) {
case _IOC_NR(NS_MNT_GET_INFO):
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0);
case _IOC_NR(NS_MNT_GET_NEXT):
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0);
case _IOC_NR(NS_MNT_GET_PREV):
- return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0);
}
return false;
@@ -202,26 +219,14 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
return -EINVAL;
return open_related_ns(ns, ns->ops->get_parent);
case NS_GET_NSTYPE:
- return ns->ops->type;
+ return ns->ns_type;
case NS_GET_OWNER_UID:
- if (ns->ops->type != CLONE_NEWUSER)
+ if (ns->ns_type != CLONE_NEWUSER)
return -EINVAL;
user_ns = container_of(ns, struct user_namespace, ns);
argp = (uid_t __user *) arg;
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
- case NS_GET_MNTNS_ID: {
- __u64 __user *idp;
- __u64 id;
-
- if (ns->ops->type != CLONE_NEWNS)
- return -EINVAL;
-
- mnt_ns = container_of(ns, struct mnt_namespace, ns);
- idp = (__u64 __user *)arg;
- id = mnt_ns->seq;
- return put_user(id, idp);
- }
case NS_GET_PID_FROM_PIDNS:
fallthrough;
case NS_GET_TGID_FROM_PIDNS:
@@ -229,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
case NS_GET_PID_IN_PIDNS:
fallthrough;
case NS_GET_TGID_IN_PIDNS: {
- if (ns->ops->type != CLONE_NEWPID)
+ if (ns->ns_type != CLONE_NEWPID)
return -EINVAL;
ret = -ESRCH;
@@ -267,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
ret = -ESRCH;
return ret;
}
+ case NS_GET_MNTNS_ID:
+ if (ns->ns_type != CLONE_NEWNS)
+ return -EINVAL;
+ fallthrough;
+ case NS_GET_ID: {
+ __u64 __user *idp;
+ __u64 id;
+
+ idp = (__u64 __user *)arg;
+ id = ns->ns_id;
+ return put_user(id, idp);
+ }
}
/* extensible ioctls */
@@ -276,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
size_t usize = _IOC_SIZE(ioctl);
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return -EINVAL;
if (!uinfo)
@@ -297,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct file *f __free(fput) = NULL;
size_t usize = _IOC_SIZE(ioctl);
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return -EINVAL;
if (usize < MNT_NS_INFO_SIZE_VER0)
@@ -415,12 +432,164 @@ static const struct stashed_operations nsfs_stashed_ops = {
.put_data = nsfs_put_data,
};
+#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32))
+#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32))
+
+static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh;
+ struct ns_common *ns = inode->i_private;
+ int len = *max_len;
+
+ if (parent)
+ return FILEID_INVALID;
+
+ if (len < NSFS_FID_SIZE_U32_VER0) {
+ *max_len = NSFS_FID_SIZE_U32_LATEST;
+ return FILEID_INVALID;
+ } else if (len > NSFS_FID_SIZE_U32_LATEST) {
+ *max_len = NSFS_FID_SIZE_U32_LATEST;
+ }
+
+ fid->ns_id = ns->ns_id;
+ fid->ns_type = ns->ns_type;
+ fid->ns_inum = inode->i_ino;
+ return FILEID_NSFS;
+}
+
+static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct path path __free(path_put) = {};
+ struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh;
+ struct user_namespace *owning_ns = NULL;
+ struct ns_common *ns;
+ int ret;
+
+ if (fh_len < NSFS_FID_SIZE_U32_VER0)
+ return NULL;
+
+ /* Check that any trailing bytes are zero. */
+ if ((fh_len > NSFS_FID_SIZE_U32_LATEST) &&
+ memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0,
+ fh_len - NSFS_FID_SIZE_U32_LATEST))
+ return NULL;
+
+ switch (fh_type) {
+ case FILEID_NSFS:
+ break;
+ default:
+ return NULL;
+ }
+
+ scoped_guard(rcu) {
+ ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
+ if (!ns)
+ return NULL;
+
+ VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
+ VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type);
+ VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
+
+ if (!__ns_ref_get(ns))
+ return NULL;
+ }
+
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ if (!current_in_namespace(to_cg_ns(ns)))
+ owning_ns = to_cg_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ if (!current_in_namespace(to_ipc_ns(ns)))
+ owning_ns = to_ipc_ns(ns)->user_ns;
+ break;
+#endif
+ case CLONE_NEWNS:
+ if (!current_in_namespace(to_mnt_ns(ns)))
+ owning_ns = to_mnt_ns(ns)->user_ns;
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ if (!current_in_namespace(to_net_ns(ns)))
+ owning_ns = to_net_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ if (!current_in_namespace(to_pid_ns(ns))) {
+ owning_ns = to_pid_ns(ns)->user_ns;
+ } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) {
+ ns->ops->put(ns);
+ return ERR_PTR(-EPERM);
+ }
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ if (!current_in_namespace(to_time_ns(ns)))
+ owning_ns = to_time_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ if (!current_in_namespace(to_user_ns(ns)))
+ owning_ns = to_user_ns(ns);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ if (!current_in_namespace(to_uts_ns(ns)))
+ owning_ns = to_uts_ns(ns)->user_ns;
+ break;
+#endif
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) {
+ ns->ops->put(ns);
+ return ERR_PTR(-EPERM);
+ }
+
+ /* path_from_stashed() unconditionally consumes the reference. */
+ ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return no_free_ptr(path.dentry);
+}
+
+static int nsfs_export_permission(struct handle_to_path_ctx *ctx,
+ unsigned int oflags)
+{
+ /* nsfs_fh_to_dentry() performs all permission checks. */
+ return 0;
+}
+
+static struct file *nsfs_export_open(struct path *path, unsigned int oflags)
+{
+ return file_open_root(path, "", oflags, 0);
+}
+
+static const struct export_operations nsfs_export_operations = {
+ .encode_fh = nsfs_encode_fh,
+ .fh_to_dentry = nsfs_fh_to_dentry,
+ .open = nsfs_export_open,
+ .permission = nsfs_export_permission,
+};
+
static int nsfs_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
if (!ctx)
return -ENOMEM;
ctx->ops = &nsfs_ops;
+ ctx->eops = &nsfs_export_operations;
ctx->dops = &ns_dentry_operations;
fc->s_fs_info = (void *)&nsfs_stashed_ops;
return 0;
@@ -438,4 +607,6 @@ void __init nsfs_init(void)
if (IS_ERR(nsfs_mnt))
panic("can't set nsfs up\n");
nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
+ nsfs_root_path.mnt = nsfs_mnt;
+ nsfs_root_path.dentry = nsfs_mnt->mnt_root;
}
diff --git a/fs/pidfs.c b/fs/pidfs.c
index d01729c5263a..c40c29c702e5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
* erronously mistook the file descriptor for a pidfd.
* This is not perfect but will catch most cases.
*/
- return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO));
+ return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
}
return false;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index fd1f1c8a939a..1e24e085c7d5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -143,7 +143,7 @@ static int proc_parse_pidns_param(struct fs_context *fc,
if (!proc_ns_file(ns_filp))
return invalfc(fc, "pidns argument is not an nsfs file");
ns = get_proc_ns(file_inode(ns_filp));
- if (ns->ops->type != CLONE_NEWPID)
+ if (ns->ns_type != CLONE_NEWPID)
return invalfc(fc, "pidns argument is not a pidns file");
target = container_of(ns, struct pid_namespace, ns);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 56d9556a181a..bab98357960d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -27,6 +27,7 @@
#include <linux/kernel_stat.h>
#include <linux/cgroup-defs.h>
+#include <linux/cgroup_namespace.h>
struct kernel_clone_args;
@@ -783,52 +784,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
#endif /* CONFIG_CGROUP_DATA */
-struct cgroup_namespace {
- struct ns_common ns;
- struct user_namespace *user_ns;
- struct ucounts *ucounts;
- struct css_set *root_cset;
-};
-
-extern struct cgroup_namespace init_cgroup_ns;
-
-#ifdef CONFIG_CGROUPS
-
-void free_cgroup_ns(struct cgroup_namespace *ns);
-
-struct cgroup_namespace *copy_cgroup_ns(u64 flags,
- struct user_namespace *user_ns,
- struct cgroup_namespace *old_ns);
-
-int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns);
-
-static inline void get_cgroup_ns(struct cgroup_namespace *ns)
-{
- refcount_inc(&ns->ns.count);
-}
-
-static inline void put_cgroup_ns(struct cgroup_namespace *ns)
-{
- if (refcount_dec_and_test(&ns->ns.count))
- free_cgroup_ns(ns);
-}
-
-#else /* !CONFIG_CGROUPS */
-
-static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
-static inline struct cgroup_namespace *
-copy_cgroup_ns(u64 flags, struct user_namespace *user_ns,
- struct cgroup_namespace *old_ns)
-{
- return old_ns;
-}
-
-static inline void get_cgroup_ns(struct cgroup_namespace *ns) { }
-static inline void put_cgroup_ns(struct cgroup_namespace *ns) { }
-
-#endif /* !CONFIG_CGROUPS */
-
#ifdef CONFIG_CGROUPS
void cgroup_enter_frozen(void);
diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h
new file mode 100644
index 000000000000..78a8418558a4
--- /dev/null
+++ b/include/linux/cgroup_namespace.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CGROUP_NAMESPACE_H
+#define _LINUX_CGROUP_NAMESPACE_H
+
+#include <linux/ns_common.h>
+
+struct cgroup_namespace {
+ struct ns_common ns;
+ struct user_namespace *user_ns;
+ struct ucounts *ucounts;
+ struct css_set *root_cset;
+};
+
+extern struct cgroup_namespace init_cgroup_ns;
+
+#ifdef CONFIG_CGROUPS
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct cgroup_namespace, ns);
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns);
+
+struct cgroup_namespace *copy_cgroup_ns(u64 flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns);
+
+int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns);
+
+static inline void get_cgroup_ns(struct cgroup_namespace *ns)
+{
+ ns_ref_inc(ns);
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+ if (ns_ref_put(ns))
+ free_cgroup_ns(ns);
+}
+
+#else /* !CONFIG_CGROUPS */
+
+static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
+static inline struct cgroup_namespace *
+copy_cgroup_ns(u64 flags, struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ return old_ns;
+}
+
+static inline void get_cgroup_ns(struct cgroup_namespace *ns) { }
+static inline void put_cgroup_ns(struct cgroup_namespace *ns) { }
+
+#endif /* !CONFIG_CGROUPS */
+
+#endif /* _LINUX_CGROUP_NAMESPACE_H */
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index cfb0dd1ea49c..3aac58a520c7 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -123,6 +123,12 @@ enum fid_type {
FILEID_BCACHEFS_WITH_PARENT = 0xb2,
/*
+ *
+ * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number.
+ */
+ FILEID_NSFS = 0xf1,
+
+ /*
* 64 bit unique kernfs id
*/
FILEID_KERNFS = 0xfe,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a854f2910cd9..5540836f674b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -4018,4 +4018,18 @@ static inline bool vfs_empty_path(int dfd, const char __user *path)
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
+static inline bool extensible_ioctl_valid(unsigned int cmd_a,
+ unsigned int cmd_b, size_t min_size)
+{
+ if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b))
+ return false;
+ if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b))
+ return false;
+ if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b))
+ return false;
+ if (_IOC_SIZE(cmd_a) < min_size)
+ return false;
+ return true;
+}
+
#endif /* _LINUX_FS_H */
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 4b399893e2b3..12faca29bbb9 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -129,20 +129,25 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
#endif
#if defined(CONFIG_IPC_NS)
+static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct ipc_namespace, ns);
+}
+
extern struct ipc_namespace *copy_ipcs(u64 flags,
struct user_namespace *user_ns, struct ipc_namespace *ns);
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
if (ns)
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
return ns;
}
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
if (ns) {
- if (refcount_inc_not_zero(&ns->ns.count))
+ if (ns_ref_get(ns))
return ns;
}
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index ff290c87b2e7..0acd1089d149 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -11,6 +11,8 @@ struct fs_struct;
struct user_namespace;
struct ns_common;
+extern struct mnt_namespace init_mnt_ns;
+
extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *,
struct user_namespace *, struct fs_struct *);
extern void put_mnt_ns(struct mnt_namespace *ns);
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 7d22ea50b098..f5b68b8abb54 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -3,14 +3,151 @@
#define _LINUX_NS_COMMON_H
#include <linux/refcount.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/sched.h>
struct proc_ns_operations;
+struct cgroup_namespace;
+struct ipc_namespace;
+struct mnt_namespace;
+struct net;
+struct pid_namespace;
+struct time_namespace;
+struct user_namespace;
+struct uts_namespace;
+
+extern struct cgroup_namespace init_cgroup_ns;
+extern struct ipc_namespace init_ipc_ns;
+extern struct mnt_namespace init_mnt_ns;
+extern struct net init_net;
+extern struct pid_namespace init_pid_ns;
+extern struct time_namespace init_time_ns;
+extern struct user_namespace init_user_ns;
+extern struct uts_namespace init_uts_ns;
+
+extern const struct proc_ns_operations netns_operations;
+extern const struct proc_ns_operations utsns_operations;
+extern const struct proc_ns_operations ipcns_operations;
+extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
+extern const struct proc_ns_operations userns_operations;
+extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations timens_operations;
+extern const struct proc_ns_operations timens_for_children_operations;
+
struct ns_common {
+ u32 ns_type;
struct dentry *stashed;
const struct proc_ns_operations *ops;
unsigned int inum;
- refcount_t count;
+ refcount_t __ns_ref; /* do not use directly */
+ union {
+ struct {
+ u64 ns_id;
+ struct rb_node ns_tree_node;
+ struct list_head ns_list_node;
+ };
+ struct rcu_head ns_rcu;
+ };
};
+int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
+void __ns_common_free(struct ns_common *ns);
+
+#define to_ns_common(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &(__ns)->ns, \
+ const struct cgroup_namespace *: &(__ns)->ns, \
+ struct ipc_namespace *: &(__ns)->ns, \
+ const struct ipc_namespace *: &(__ns)->ns, \
+ struct mnt_namespace *: &(__ns)->ns, \
+ const struct mnt_namespace *: &(__ns)->ns, \
+ struct net *: &(__ns)->ns, \
+ const struct net *: &(__ns)->ns, \
+ struct pid_namespace *: &(__ns)->ns, \
+ const struct pid_namespace *: &(__ns)->ns, \
+ struct time_namespace *: &(__ns)->ns, \
+ const struct time_namespace *: &(__ns)->ns, \
+ struct user_namespace *: &(__ns)->ns, \
+ const struct user_namespace *: &(__ns)->ns, \
+ struct uts_namespace *: &(__ns)->ns, \
+ const struct uts_namespace *: &(__ns)->ns)
+
+#define ns_init_inum(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
+ struct ipc_namespace *: IPC_NS_INIT_INO, \
+ struct mnt_namespace *: MNT_NS_INIT_INO, \
+ struct net *: NET_NS_INIT_INO, \
+ struct pid_namespace *: PID_NS_INIT_INO, \
+ struct time_namespace *: TIME_NS_INIT_INO, \
+ struct user_namespace *: USER_NS_INIT_INO, \
+ struct uts_namespace *: UTS_NS_INIT_INO)
+
+#define ns_init_ns(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &init_cgroup_ns, \
+ struct ipc_namespace *: &init_ipc_ns, \
+ struct mnt_namespace *: &init_mnt_ns, \
+ struct net *: &init_net, \
+ struct pid_namespace *: &init_pid_ns, \
+ struct time_namespace *: &init_time_ns, \
+ struct user_namespace *: &init_user_ns, \
+ struct uts_namespace *: &init_uts_ns)
+
+#define to_ns_operations(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
+ struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
+ struct mnt_namespace *: &mntns_operations, \
+ struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
+ struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
+ struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
+ struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
+ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
+
+#define ns_common_type(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CLONE_NEWCGROUP, \
+ struct ipc_namespace *: CLONE_NEWIPC, \
+ struct mnt_namespace *: CLONE_NEWNS, \
+ struct net *: CLONE_NEWNET, \
+ struct pid_namespace *: CLONE_NEWPID, \
+ struct time_namespace *: CLONE_NEWTIME, \
+ struct user_namespace *: CLONE_NEWUSER, \
+ struct uts_namespace *: CLONE_NEWUTS)
+
+#define ns_common_init(__ns) \
+ __ns_common_init(to_ns_common(__ns), \
+ ns_common_type(__ns), \
+ to_ns_operations(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0))
+
+#define ns_common_init_inum(__ns, __inum) \
+ __ns_common_init(to_ns_common(__ns), \
+ ns_common_type(__ns), \
+ to_ns_operations(__ns), \
+ __inum)
+
+#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns)))
+
+static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
+{
+ return refcount_dec_and_test(&ns->__ns_ref);
+}
+
+static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
+{
+ return refcount_inc_not_zero(&ns->__ns_ref);
+}
+
+#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref)
+#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref)
+#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
+#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
+#define ns_ref_put_and_lock(__ns, __lock) \
+ refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock))
+
#endif
diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h
new file mode 100644
index 000000000000..e5a5fa83d36b
--- /dev/null
+++ b/include/linux/nsfs.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+
+#ifndef _LINUX_NSFS_H
+#define _LINUX_NSFS_H
+
+#include <linux/ns_common.h>
+#include <linux/cred.h>
+#include <linux/pid_namespace.h>
+
+struct path;
+struct task_struct;
+struct proc_ns_operations;
+
+int ns_get_path(struct path *path, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops);
+typedef struct ns_common *ns_get_path_helper_t(void *);
+int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb,
+ void *private_data);
+
+bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino);
+
+int ns_get_name(char *buf, size_t size, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops);
+void nsfs_init(void);
+
+#define __current_namespace_from_type(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: current->nsproxy->cgroup_ns, \
+ struct ipc_namespace *: current->nsproxy->ipc_ns, \
+ struct net *: current->nsproxy->net_ns, \
+ struct pid_namespace *: task_active_pid_ns(current), \
+ struct mnt_namespace *: current->nsproxy->mnt_ns, \
+ struct time_namespace *: current->nsproxy->time_ns, \
+ struct user_namespace *: current_user_ns(), \
+ struct uts_namespace *: current->nsproxy->uts_ns)
+
+#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns)
+
+#endif /* _LINUX_NSFS_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 82533e899ff4..bd118a187dec 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -42,17 +42,6 @@ struct nsproxy {
};
extern struct nsproxy init_nsproxy;
-#define to_ns_common(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &(__ns->ns), \
- struct ipc_namespace *: &(__ns->ns), \
- struct net *: &(__ns->ns), \
- struct pid_namespace *: &(__ns->ns), \
- struct mnt_namespace *: &(__ns->ns), \
- struct time_namespace *: &(__ns->ns), \
- struct user_namespace *: &(__ns->ns), \
- struct uts_namespace *: &(__ns->ns))
-
/*
* A structure to encompass all bits needed to install
* a partial or complete new set of namespaces.
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
new file mode 100644
index 000000000000..8b8636690473
--- /dev/null
+++ b/include/linux/nstree.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NSTREE_H
+#define _LINUX_NSTREE_H
+
+#include <linux/ns_common.h>
+#include <linux/nsproxy.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/rculist.h>
+#include <linux/cookie.h>
+
+extern struct ns_tree cgroup_ns_tree;
+extern struct ns_tree ipc_ns_tree;
+extern struct ns_tree mnt_ns_tree;
+extern struct ns_tree net_ns_tree;
+extern struct ns_tree pid_ns_tree;
+extern struct ns_tree time_ns_tree;
+extern struct ns_tree user_ns_tree;
+extern struct ns_tree uts_ns_tree;
+
+#define to_ns_tree(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &(cgroup_ns_tree), \
+ struct ipc_namespace *: &(ipc_ns_tree), \
+ struct net *: &(net_ns_tree), \
+ struct pid_namespace *: &(pid_ns_tree), \
+ struct mnt_namespace *: &(mnt_ns_tree), \
+ struct time_namespace *: &(time_ns_tree), \
+ struct user_namespace *: &(user_ns_tree), \
+ struct uts_namespace *: &(uts_ns_tree))
+
+u64 ns_tree_gen_id(struct ns_common *ns);
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree);
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree);
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type);
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+ struct ns_tree *ns_tree,
+ bool previous);
+
+static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ ns_tree_gen_id(ns);
+ __ns_tree_add_raw(ns, ns_tree);
+}
+
+/**
+ * ns_tree_add_raw - Add a namespace to a namespace
+ * @ns: Namespace to add
+ *
+ * This function adds a namespace to the appropriate namespace tree
+ * without assigning a id.
+ */
+#define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns))
+
+/**
+ * ns_tree_add - Add a namespace to a namespace tree
+ * @ns: Namespace to add
+ *
+ * This function assigns a new id to the namespace and adds it to the
+ * appropriate namespace tree and list.
+ */
+#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns))
+
+/**
+ * ns_tree_remove - Remove a namespace from a namespace tree
+ * @ns: Namespace to remove
+ *
+ * This function removes a namespace from the appropriate namespace
+ * tree and list.
+ */
+#define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns))
+
+#define ns_tree_adjoined_rcu(__ns, __previous) \
+ __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous)
+
+#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node))
+
+#endif /* _LINUX_NSTREE_H */
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c2af02d70201..445517a72ad0 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -54,10 +54,15 @@ extern struct pid_namespace init_pid_ns;
#define PIDNS_ADDING (1U << 31)
#ifdef CONFIG_PID_NS
+static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct pid_namespace, ns);
+}
+
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
if (ns != &init_pid_ns)
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
return ns;
}
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 4b20375f3783..e81b8e596e4f 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -5,7 +5,7 @@
#ifndef _LINUX_PROC_NS_H
#define _LINUX_PROC_NS_H
-#include <linux/ns_common.h>
+#include <linux/nsfs.h>
#include <uapi/linux/nsfs.h>
struct pid_namespace;
@@ -17,7 +17,6 @@ struct inode;
struct proc_ns_operations {
const char *name;
const char *real_ns_name;
- int type;
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns);
int (*install)(struct nsset *nsset, struct ns_common *ns);
@@ -66,25 +65,6 @@ static inline void proc_free_inum(unsigned int inum) {}
#endif /* CONFIG_PROC_FS */
-static inline int ns_alloc_inum(struct ns_common *ns)
-{
- WRITE_ONCE(ns->stashed, NULL);
- return proc_alloc_inum(&ns->inum);
-}
-
-#define ns_free_inum(ns) proc_free_inum((ns)->inum)
-
#define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
-extern int ns_get_path(struct path *path, struct task_struct *task,
- const struct proc_ns_operations *ns_ops);
-typedef struct ns_common *ns_get_path_helper_t(void *);
-extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb,
- void *private_data);
-
-extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino);
-
-extern int ns_get_name(char *buf, size_t size, struct task_struct *task,
- const struct proc_ns_operations *ns_ops);
-extern void nsfs_init(void);
#endif /* _LINUX_PROC_NS_H */
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index b6e36525e0be..c514d0e5a45c 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -33,13 +33,18 @@ struct time_namespace {
extern struct time_namespace init_time_ns;
#ifdef CONFIG_TIME_NS
+static inline struct time_namespace *to_time_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct time_namespace, ns);
+}
+void __init time_ns_init(void);
extern int vdso_join_timens(struct task_struct *task,
struct time_namespace *ns);
extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
return ns;
}
@@ -52,7 +57,7 @@ struct page *find_timens_vvar_page(struct vm_area_struct *vma);
static inline void put_time_ns(struct time_namespace *ns)
{
- if (refcount_dec_and_test(&ns->ns.count))
+ if (ns_ref_put(ns))
free_time_ns(ns);
}
@@ -108,6 +113,10 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
}
#else
+static inline void __init time_ns_init(void)
+{
+}
+
static inline int vdso_join_timens(struct task_struct *task,
struct time_namespace *ns)
{
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index a0bb6d012137..9a9aebbf96b9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -168,10 +168,15 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
#ifdef CONFIG_USER_NS
+static inline struct user_namespace *to_user_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct user_namespace, ns);
+}
+
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
if (ns)
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
return ns;
}
@@ -181,7 +186,7 @@ extern void __put_user_ns(struct user_namespace *ns);
static inline void put_user_ns(struct user_namespace *ns)
{
- if (ns && refcount_dec_and_test(&ns->ns.count))
+ if (ns && ns_ref_put(ns))
__put_user_ns(ns);
}
diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h
new file mode 100644
index 000000000000..60f37fec0f4b
--- /dev/null
+++ b/include/linux/uts_namespace.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UTS_NAMESPACE_H
+#define _LINUX_UTS_NAMESPACE_H
+
+#include <linux/ns_common.h>
+#include <uapi/linux/utsname.h>
+
+struct user_namespace;
+extern struct user_namespace init_user_ns;
+
+struct uts_namespace {
+ struct new_utsname name;
+ struct user_namespace *user_ns;
+ struct ucounts *ucounts;
+ struct ns_common ns;
+} __randomize_layout;
+
+extern struct uts_namespace init_uts_ns;
+
+#ifdef CONFIG_UTS_NS
+static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct uts_namespace, ns);
+}
+
+static inline void get_uts_ns(struct uts_namespace *ns)
+{
+ ns_ref_inc(ns);
+}
+
+extern struct uts_namespace *copy_utsname(u64 flags,
+ struct user_namespace *user_ns, struct uts_namespace *old_ns);
+extern void free_uts_ns(struct uts_namespace *ns);
+
+static inline void put_uts_ns(struct uts_namespace *ns)
+{
+ if (ns_ref_put(ns))
+ free_uts_ns(ns);
+}
+
+void uts_ns_init(void);
+#else
+static inline void get_uts_ns(struct uts_namespace *ns)
+{
+}
+
+static inline void put_uts_ns(struct uts_namespace *ns)
+{
+}
+
+static inline struct uts_namespace *copy_utsname(u64 flags,
+ struct user_namespace *user_ns, struct uts_namespace *old_ns)
+{
+ if (flags & CLONE_NEWUTS)
+ return ERR_PTR(-EINVAL);
+
+ return old_ns;
+}
+
+static inline void uts_ns_init(void)
+{
+}
+#endif
+
+#endif /* _LINUX_UTS_NAMESPACE_H */
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index ba34ec0e2f95..547bd4439706 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -7,7 +7,7 @@
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/err.h>
-#include <uapi/linux/utsname.h>
+#include <linux/uts_namespace.h>
enum uts_proc {
UTS_PROC_ARCH,
@@ -18,57 +18,6 @@ enum uts_proc {
UTS_PROC_DOMAINNAME,
};
-struct user_namespace;
-extern struct user_namespace init_user_ns;
-
-struct uts_namespace {
- struct new_utsname name;
- struct user_namespace *user_ns;
- struct ucounts *ucounts;
- struct ns_common ns;
-} __randomize_layout;
-extern struct uts_namespace init_uts_ns;
-
-#ifdef CONFIG_UTS_NS
-static inline void get_uts_ns(struct uts_namespace *ns)
-{
- refcount_inc(&ns->ns.count);
-}
-
-extern struct uts_namespace *copy_utsname(u64 flags,
- struct user_namespace *user_ns, struct uts_namespace *old_ns);
-extern void free_uts_ns(struct uts_namespace *ns);
-
-static inline void put_uts_ns(struct uts_namespace *ns)
-{
- if (refcount_dec_and_test(&ns->ns.count))
- free_uts_ns(ns);
-}
-
-void uts_ns_init(void);
-#else
-static inline void get_uts_ns(struct uts_namespace *ns)
-{
-}
-
-static inline void put_uts_ns(struct uts_namespace *ns)
-{
-}
-
-static inline struct uts_namespace *copy_utsname(u64 flags,
- struct user_namespace *user_ns, struct uts_namespace *old_ns)
-{
- if (flags & CLONE_NEWUTS)
- return ERR_PTR(-EINVAL);
-
- return old_ns;
-}
-
-static inline void uts_ns_init(void)
-{
-}
-#endif
-
#ifdef CONFIG_PROC_SYSCTL
extern void uts_proc_notify(enum uts_proc proc);
#else
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 0e008cfe159d..cb664f6e3558 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -262,10 +262,15 @@ void ipx_unregister_sysctl(void);
#ifdef CONFIG_NET_NS
void __put_net(struct net *net);
+static inline struct net *to_net_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct net, ns);
+}
+
/* Try using get_net_track() instead */
static inline struct net *get_net(struct net *net)
{
- refcount_inc(&net->ns.count);
+ ns_ref_inc(net);
return net;
}
@@ -276,7 +281,7 @@ static inline struct net *maybe_get_net(struct net *net)
* exists. If the reference count is zero this
* function fails and returns NULL.
*/
- if (!refcount_inc_not_zero(&net->ns.count))
+ if (!ns_ref_get(net))
net = NULL;
return net;
}
@@ -284,7 +289,7 @@ static inline struct net *maybe_get_net(struct net *net)
/* Try using put_net_track() instead */
static inline void put_net(struct net *net)
{
- if (refcount_dec_and_test(&net->ns.count))
+ if (ns_ref_put(net))
__put_net(net);
}
@@ -296,7 +301,7 @@ int net_eq(const struct net *net1, const struct net *net2)
static inline int check_net(const struct net *net)
{
- return refcount_read(&net->ns.count) != 0;
+ return ns_ref_read(net) != 0;
}
void net_drop_ns(void *);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index f291ab4f94eb..3741ea1b73d8 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -111,6 +111,7 @@
#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */
#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */
/* Generic flags for the *at(2) family of syscalls. */
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 97d8d80d139f..e098759ec917 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -16,8 +16,6 @@
#define NS_GET_NSTYPE _IO(NSIO, 0x3)
/* Get owner UID (in the caller's user namespace) for a user namespace */
#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
-/* Get the id for a mount namespace */
-#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64)
/* Translate pid from target pid namespace into the caller's pid namespace. */
#define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int)
/* Return thread-group leader id of pid in the callers pid namespace. */
@@ -42,6 +40,10 @@ struct mnt_ns_info {
/* Get previous namespace. */
#define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info)
+/* Retrieve namespace identifiers. */
+#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64)
+#define NS_GET_ID _IOR(NSIO, 13, __u64)
+
enum init_ns_ino {
IPC_NS_INIT_INO = 0xEFFFFFFFU,
UTS_NS_INIT_INO = 0xEFFFFFFEU,
@@ -51,6 +53,18 @@ enum init_ns_ino {
TIME_NS_INIT_INO = 0xEFFFFFFAU,
NET_NS_INIT_INO = 0xEFFFFFF9U,
MNT_NS_INIT_INO = 0xEFFFFFF8U,
+#ifdef __KERNEL__
+ MNT_NS_ANON_INO = 0xEFFFFFF7U,
+#endif
+};
+
+struct nsfs_file_handle {
+ __u64 ns_id;
+ __u32 ns_type;
+ __u32 ns_inum;
};
+#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
+#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
+
#endif /* __LINUX_NSFS_H */
diff --git a/init/main.c b/init/main.c
index 5753e9539ae6..fab4f599c035 100644
--- a/init/main.c
+++ b/init/main.c
@@ -103,6 +103,7 @@
#include <linux/randomize_kstack.h>
#include <linux/pidfs.h>
#include <linux/ptdump.h>
+#include <linux/time_namespace.h>
#include <net/net_namespace.h>
#include <asm/io.h>
@@ -1072,6 +1073,7 @@ void start_kernel(void)
fork_init();
proc_caches_init();
uts_ns_init();
+ time_ns_init();
key_init();
security_init();
dbg_late_init();
diff --git a/init/version-timestamp.c b/init/version-timestamp.c
index 043cbf80a766..d071835121c2 100644
--- a/init/version-timestamp.c
+++ b/init/version-timestamp.c
@@ -8,7 +8,8 @@
#include <linux/utsname.h>
struct uts_namespace init_uts_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.ns_type = ns_common_type(&init_uts_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.name = {
.sysname = UTS_SYSNAME,
.nodename = UTS_NODENAME,
@@ -18,7 +19,7 @@ struct uts_namespace init_uts_ns = {
.domainname = UTS_DOMAINNAME,
},
.user_ns = &init_user_ns,
- .ns.inum = PROC_UTS_INIT_INO,
+ .ns.inum = ns_init_inum(&init_uts_ns),
#ifdef CONFIG_UTS_NS
.ns.ops = &utsns_operations,
#endif
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index c7be0c792647..7a03f6d03de3 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -15,6 +15,7 @@
#include <linux/proc_ns.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
+#include <linux/nstree.h>
#include "util.h"
@@ -26,12 +27,13 @@ DEFINE_SPINLOCK(mq_lock);
* and not CONFIG_IPC_NS.
*/
struct ipc_namespace init_ipc_ns = {
- .ns.count = REFCOUNT_INIT(1),
+ .ns.__ns_ref = REFCOUNT_INIT(1),
.user_ns = &init_user_ns,
- .ns.inum = PROC_IPC_INIT_INO,
+ .ns.inum = ns_init_inum(&init_ipc_ns),
#ifdef CONFIG_IPC_NS
.ns.ops = &ipcns_operations,
#endif
+ .ns.ns_type = ns_common_type(&init_ipc_ns),
};
struct msg_msgseg {
diff --git a/ipc/namespace.c b/ipc/namespace.c
index a712ec27209c..59b12fcb40bd 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -15,6 +15,7 @@
#include <linux/mount.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/nstree.h>
#include <linux/sched/task.h>
#include "util.h"
@@ -61,12 +62,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
if (ns == NULL)
goto fail_dec;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free;
- ns->ns.ops = &ipcns_operations;
- refcount_set(&ns->ns.count, 1);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
@@ -87,6 +86,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
sem_init_ns(ns);
shm_init_ns(ns);
+ ns_tree_add(ns);
return ns;
@@ -97,7 +97,7 @@ fail_mq:
fail_put:
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
fail_free:
kfree(ns);
fail_dec:
@@ -161,7 +161,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
dec_ipc_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
kfree(ns);
}
@@ -199,20 +199,16 @@ static void free_ipc(struct work_struct *unused)
*/
void put_ipc_ns(struct ipc_namespace *ns)
{
- if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) {
+ if (ns_ref_put_and_lock(ns, &mq_lock)) {
mq_clear_sbinfo(ns);
spin_unlock(&mq_lock);
+ ns_tree_remove(ns);
if (llist_add(&ns->mnt_llist, &free_ipc_list))
schedule_work(&free_ipc_work);
}
}
-static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
-{
- return container_of(ns, struct ipc_namespace, ns);
-}
-
static struct ns_common *ipcns_get(struct task_struct *task)
{
struct ipc_namespace *ns = NULL;
@@ -252,7 +248,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns)
const struct proc_ns_operations ipcns_operations = {
.name = "ipc",
- .type = CLONE_NEWIPC,
.get = ipcns_get,
.put = ipcns_put,
.install = ipcns_install,
diff --git a/ipc/shm.c b/ipc/shm.c
index a9310b6dbbc3..3db36773dd10 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -45,6 +45,7 @@
#include <linux/mount.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>
+#include <linux/nstree.h>
#include <linux/uaccess.h>
@@ -148,6 +149,7 @@ void shm_exit_ns(struct ipc_namespace *ns)
static int __init ipc_ns_init(void)
{
shm_init_ns(&init_ipc_ns);
+ ns_tree_add(&init_ipc_ns);
return 0;
}
diff --git a/kernel/Makefile b/kernel/Makefile
index c60623448235..1f48f7cd2d7b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \
sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
- kthread.o sys_ni.o nsproxy.o \
+ kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 77d02f87f3f1..a0d5d62f1483 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -59,6 +59,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
+#include <linux/nstree.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -241,11 +242,12 @@ static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_D
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
- .ns.inum = PROC_CGROUP_INIT_INO,
+ .ns.inum = ns_init_inum(&init_cgroup_ns),
.root_cset = &init_css_set,
+ .ns.ns_type = ns_common_type(&init_cgroup_ns),
};
static struct file_system_type cgroup2_fs_type;
@@ -6336,6 +6338,7 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
+ ns_tree_add(&init_cgroup_ns);
return 0;
}
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index dedadb525880..fdbe57578e68 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -5,7 +5,7 @@
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
-
+#include <linux/nstree.h>
/* cgroup namespaces */
@@ -21,29 +21,28 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts)
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
- struct cgroup_namespace *new_ns;
+ struct cgroup_namespace *new_ns __free(kfree) = NULL;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
+ ret = ns_common_init(new_ns);
+ if (ret)
return ERR_PTR(ret);
- }
- refcount_set(&new_ns->ns.count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
+ ns_tree_add(new_ns);
+ return no_free_ptr(new_ns);
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
+ ns_tree_remove(ns);
put_css_set(ns->root_cset);
dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
EXPORT_SYMBOL(free_cgroup_ns);
@@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags,
return new_ns;
}
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns)
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
- .type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
new file mode 100644
index 000000000000..c1fb2bad6d72
--- /dev/null
+++ b/kernel/nscommon.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ns_common.h>
+#include <linux/proc_ns.h>
+#include <linux/vfsdebug.h>
+
+#ifdef CONFIG_DEBUG_VFS
+static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
+{
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ VFS_WARN_ON_ONCE(ops != &ipcns_operations);
+ break;
+#endif
+ case CLONE_NEWNS:
+ VFS_WARN_ON_ONCE(ops != &mntns_operations);
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ VFS_WARN_ON_ONCE(ops != &netns_operations);
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ VFS_WARN_ON_ONCE(ops != &pidns_operations);
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ VFS_WARN_ON_ONCE(ops != &timens_operations);
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ VFS_WARN_ON_ONCE(ops != &userns_operations);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ VFS_WARN_ON_ONCE(ops != &utsns_operations);
+ break;
+#endif
+ }
+}
+#endif
+
+int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
+{
+ refcount_set(&ns->__ns_ref, 1);
+ ns->stashed = NULL;
+ ns->ops = ops;
+ ns->ns_id = 0;
+ ns->ns_type = ns_type;
+ RB_CLEAR_NODE(&ns->ns_tree_node);
+ INIT_LIST_HEAD(&ns->ns_list_node);
+
+#ifdef CONFIG_DEBUG_VFS
+ ns_debug(ns, ops);
+#endif
+
+ if (inum) {
+ ns->inum = inum;
+ return 0;
+ }
+ return proc_alloc_inum(&ns->inum);
+}
+
+void __ns_common_free(struct ns_common *ns)
+{
+ proc_free_inum(ns->inum);
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8af3b9ec3aa8..19aa64ab08c8 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (proc_ns_file(fd_file(f))) {
ns = get_proc_ns(file_inode(fd_file(f)));
- if (flags && (ns->ops->type != flags))
+ if (flags && (ns->ns_type != flags))
err = -EINVAL;
- flags = ns->ops->type;
+ flags = ns->ns_type;
} else if (!IS_ERR(pidfd_pid(fd_file(f)))) {
err = check_setns_flags(flags);
} else {
diff --git a/kernel/nstree.c b/kernel/nstree.c
new file mode 100644
index 000000000000..b24a320a11a6
--- /dev/null
+++ b/kernel/nstree.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/nstree.h>
+#include <linux/proc_ns.h>
+#include <linux/vfsdebug.h>
+
+/**
+ * struct ns_tree - Namespace tree
+ * @ns_tree: Rbtree of namespaces of a particular type
+ * @ns_list: Sequentially walkable list of all namespaces of this type
+ * @ns_tree_lock: Seqlock to protect the tree and list
+ * @type: type of namespaces in this tree
+ */
+struct ns_tree {
+ struct rb_root ns_tree;
+ struct list_head ns_list;
+ seqlock_t ns_tree_lock;
+ int type;
+};
+
+struct ns_tree mnt_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNS,
+};
+
+struct ns_tree net_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNET,
+};
+EXPORT_SYMBOL_GPL(net_ns_tree);
+
+struct ns_tree uts_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUTS,
+};
+
+struct ns_tree user_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUSER,
+};
+
+struct ns_tree ipc_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWIPC,
+};
+
+struct ns_tree pid_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWPID,
+};
+
+struct ns_tree cgroup_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWCGROUP,
+};
+
+struct ns_tree time_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWTIME,
+};
+
+DEFINE_COOKIE(namespace_cookie);
+
+static inline struct ns_common *node_to_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_tree_node);
+}
+
+static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct ns_common *ns_a = node_to_ns(a);
+ struct ns_common *ns_b = node_to_ns(b);
+ u64 ns_id_a = ns_a->ns_id;
+ u64 ns_id_b = ns_b->ns_id;
+
+ if (ns_id_a < ns_id_b)
+ return -1;
+ if (ns_id_a > ns_id_b)
+ return 1;
+ return 0;
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ struct rb_node *node, *prev;
+
+ VFS_WARN_ON_ONCE(!ns->ns_id);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+ node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
+ /*
+ * If there's no previous entry simply add it after the
+ * head and if there is add it after the previous entry.
+ */
+ prev = rb_prev(&ns->ns_tree_node);
+ if (!prev)
+ list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
+ else
+ list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+
+ write_sequnlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(node);
+}
+
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
+ VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+ rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
+ list_bidir_del_rcu(&ns->ns_list_node);
+ RB_CLEAR_NODE(&ns->ns_tree_node);
+ write_sequnlock(&ns_tree->ns_tree_lock);
+}
+EXPORT_SYMBOL_GPL(__ns_tree_remove);
+
+static int ns_find(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns(node);
+
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+
+static struct ns_tree *ns_tree_from_type(int ns_type)
+{
+ switch (ns_type) {
+ case CLONE_NEWCGROUP:
+ return &cgroup_ns_tree;
+ case CLONE_NEWIPC:
+ return &ipc_ns_tree;
+ case CLONE_NEWNS:
+ return &mnt_ns_tree;
+ case CLONE_NEWNET:
+ return &net_ns_tree;
+ case CLONE_NEWPID:
+ return &pid_ns_tree;
+ case CLONE_NEWUSER:
+ return &user_ns_tree;
+ case CLONE_NEWUTS:
+ return &uts_ns_tree;
+ case CLONE_NEWTIME:
+ return &time_ns_tree;
+ }
+
+ return NULL;
+}
+
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+
+ do {
+ seq = read_seqbegin(&ns_tree->ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+
+ if (!node)
+ return NULL;
+
+ VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type);
+
+ return node_to_ns(node);
+}
+
+/**
+ * ns_tree_adjoined_rcu - find the next/previous namespace in the same
+ * tree
+ * @ns: namespace to start from
+ * @previous: if true find the previous namespace, otherwise the next
+ *
+ * Find the next or previous namespace in the same tree as @ns. If
+ * there is no next/previous namespace, -ENOENT is returned.
+ */
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+ struct ns_tree *ns_tree, bool previous)
+{
+ struct list_head *list;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
+
+ if (previous)
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+ else
+ list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
+ if (list_is_head(list, &ns_tree->ns_list))
+ return ERR_PTR(-ENOENT);
+
+ VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type);
+
+ return list_entry_rcu(list, struct ns_common, ns_list_node);
+}
+
+/**
+ * ns_tree_gen_id - generate a new namespace id
+ * @ns: namespace to generate id for
+ *
+ * Generates a new namespace id and assigns it to the namespace. All
+ * namespaces types share the same id space and thus can be compared
+ * directly. IOW, when two ids of two namespace are equal, they are
+ * identical.
+ */
+u64 ns_tree_gen_id(struct ns_common *ns)
+{
+ guard(preempt)();
+ ns->ns_id = gen_cookie_next(&namespace_cookie);
+ return ns->ns_id;
+}
diff --git a/kernel/pid.c b/kernel/pid.c
index 2dbcc4dd90cc..4fffec767a63 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,13 +71,13 @@ static int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .ns.inum = PROC_PID_INIT_INO,
+ .ns.inum = ns_init_inum(&init_pid_ns),
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
@@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = {
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
+ .ns.ns_type = ns_common_type(&init_pid_ns),
};
EXPORT_SYMBOL_GPL(init_pid_ns);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d315c89e4d62..650be58d8d18 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include <linux/nstree.h>
#include <uapi/linux/wait.h>
#include "pid_sysctl.h"
@@ -102,17 +103,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_idr;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto out_free_idr;
- ns->ns.ops = &pidns_operations;
ns->pid_max = PID_MAX_LIMIT;
err = register_pidns_sysctls(ns);
if (err)
goto out_free_inum;
- refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
@@ -124,10 +123,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
+ ns_tree_add(ns);
return ns;
out_free_inum:
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@@ -149,9 +149,10 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
+ ns_tree_remove(ns);
unregister_pidns_sysctls(ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
@@ -168,7 +169,7 @@ static void destroy_pid_namespace_work(struct work_struct *work)
parent = ns->parent;
destroy_pid_namespace(ns);
ns = parent;
- } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
+ } while (ns != &init_pid_ns && ns_ref_put(ns));
}
struct pid_namespace *copy_pid_ns(u64 flags,
@@ -183,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags,
void put_pid_ns(struct pid_namespace *ns)
{
- if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
+ if (ns && ns != &init_pid_ns && ns_ref_put(ns))
schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@@ -344,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
-{
- return container_of(ns, struct pid_namespace, ns);
-}
-
static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -453,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns)
const struct proc_ns_operations pidns_operations = {
.name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
@@ -464,7 +459,6 @@ const struct proc_ns_operations pidns_operations = {
const struct proc_ns_operations pidns_for_children_operations = {
.name = "pid_for_children",
.real_ns_name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_for_children_get,
.put = pidns_put,
.install = pidns_install,
@@ -481,6 +475,7 @@ static __init int pid_namespaces_init(void)
#endif
register_pid_ns_sysctl_table_vm();
+ ns_tree_add(&init_pid_ns);
return 0;
}
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 888872bcc5bb..5b6997f4dc3d 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
+#include <linux/nstree.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
@@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
- refcount_set(&ns->ns.count, 1);
-
ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free_page;
ns->ucounts = ucounts;
- ns->ns.ops = &timens_operations;
ns->user_ns = get_user_ns(user_ns);
ns->offsets = old_ns->offsets;
ns->frozen_offsets = false;
+ ns_tree_add(ns);
return ns;
fail_free_page:
@@ -253,16 +252,13 @@ out:
void free_time_ns(struct time_namespace *ns)
{
+ ns_tree_remove(ns);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
__free_page(ns->vvar_page);
- kfree(ns);
-}
-
-static struct time_namespace *to_time_ns(struct ns_common *ns)
-{
- return container_of(ns, struct time_namespace, ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
@@ -466,7 +462,6 @@ out:
const struct proc_ns_operations timens_operations = {
.name = "time",
- .type = CLONE_NEWTIME,
.get = timens_get,
.put = timens_put,
.install = timens_install,
@@ -476,7 +471,6 @@ const struct proc_ns_operations timens_operations = {
const struct proc_ns_operations timens_for_children_operations = {
.name = "time_for_children",
.real_ns_name = "time",
- .type = CLONE_NEWTIME,
.get = timens_for_children_get,
.put = timens_put,
.install = timens_install,
@@ -484,9 +478,15 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.count = REFCOUNT_INIT(3),
+ .ns.ns_type = ns_common_type(&init_time_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(3),
.user_ns = &init_user_ns,
- .ns.inum = PROC_TIME_INIT_INO,
+ .ns.inum = ns_init_inum(&init_time_ns),
.ns.ops = &timens_operations,
.frozen_offsets = true,
};
+
+void __init time_ns_init(void)
+{
+ ns_tree_add(&init_time_ns);
+}
diff --git a/kernel/user.c b/kernel/user.c
index f46b1d41163b..0163665914c9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -65,10 +65,11 @@ struct user_namespace init_user_ns = {
.nr_extents = 1,
},
},
- .ns.count = REFCOUNT_INIT(3),
+ .ns.ns_type = ns_common_type(&init_user_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .ns.inum = PROC_USER_INIT_INO,
+ .ns.inum = ns_init_inum(&init_user_ns),
#ifdef CONFIG_USER_NS
.ns.ops = &userns_operations,
#endif
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 682f40d5632d..03cb63883d04 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,7 @@
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/nstree.h>
static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
@@ -124,12 +125,11 @@ int create_user_ns(struct cred *new)
goto fail_dec;
ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
- ret = ns_alloc_inum(&ns->ns);
+
+ ret = ns_common_init(ns);
if (ret)
goto fail_free;
- ns->ns.ops = &userns_operations;
- refcount_set(&ns->ns.count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->level = parent_ns->level + 1;
@@ -159,12 +159,13 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
set_cred_user_ns(new, ns);
+ ns_tree_add(ns);
return 0;
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
fail_free:
kmem_cache_free(user_ns_cachep, ns);
fail_dec:
@@ -201,6 +202,7 @@ static void free_user_ns(struct work_struct *work)
do {
struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ ns_tree_remove(ns);
if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
kfree(ns->gid_map.forward);
kfree(ns->gid_map.reverse);
@@ -218,11 +220,12 @@ static void free_user_ns(struct work_struct *work)
#endif
retire_userns_sysctls(ns);
key_free_user_ns(ns);
- ns_free_inum(&ns->ns);
- kmem_cache_free(user_ns_cachep, ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
dec_user_namespaces(ucounts);
ns = parent;
- } while (refcount_dec_and_test(&parent->ns.count));
+ } while (ns_ref_put(parent));
}
void __put_user_ns(struct user_namespace *ns)
@@ -1322,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns)
}
EXPORT_SYMBOL(current_in_userns);
-static inline struct user_namespace *to_user_ns(struct ns_common *ns)
-{
- return container_of(ns, struct user_namespace, ns);
-}
-
static struct ns_common *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
@@ -1402,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns)
const struct proc_ns_operations userns_operations = {
.name = "user",
- .type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
@@ -1413,6 +1410,7 @@ const struct proc_ns_operations userns_operations = {
static __init int user_namespaces_init(void)
{
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
+ ns_tree_add(&init_user_ns);
return 0;
}
subsys_initcall(user_namespaces_init);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 00d8d7922f86..ebbfc578a9d3 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -13,6 +13,7 @@
#include <linux/cred.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/nstree.h>
#include <linux/sched/task.h>
static struct kmem_cache *uts_ns_cache __ro_after_init;
@@ -27,16 +28,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
}
-static struct uts_namespace *create_uts_ns(void)
-{
- struct uts_namespace *uts_ns;
-
- uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
- if (uts_ns)
- refcount_set(&uts_ns->ns.count, 1);
- return uts_ns;
-}
-
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
@@ -55,21 +46,20 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = create_uts_ns();
+ ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL);
if (!ns)
goto fail_dec;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free;
ns->ucounts = ucounts;
- ns->ns.ops = &utsns_operations;
-
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
+ ns_tree_add(ns);
return ns;
fail_free:
@@ -105,15 +95,12 @@ struct uts_namespace *copy_utsname(u64 flags,
void free_uts_ns(struct uts_namespace *ns)
{
+ ns_tree_remove(ns);
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kmem_cache_free(uts_ns_cache, ns);
-}
-
-static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
-{
- return container_of(ns, struct uts_namespace, ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *utsns_get(struct task_struct *task)
@@ -159,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns)
const struct proc_ns_operations utsns_operations = {
.name = "uts",
- .type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
@@ -174,4 +160,5 @@ void __init uts_ns_init(void)
offsetof(struct uts_namespace, name),
sizeof_field(struct uts_namespace, name),
NULL);
+ ns_tree_add(&init_uts_ns);
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c28cd6665444..3c2dc4c5e683 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1328,7 +1328,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
struct netdev_rx_queue *queue = &dev->_rx[i];
struct kobject *kobj = &queue->kobj;
- if (!refcount_read(&dev_net(dev)->ns.count))
+ if (!check_net(dev_net(dev)))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -2061,7 +2061,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
- if (!refcount_read(&dev_net(dev)->ns.count))
+ if (!check_net(dev_net(dev)))
queue->kobj.uevent_suppress = 1;
if (netdev_uses_bql(dev))
@@ -2315,7 +2315,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &ndev->dev;
- if (!refcount_read(&dev_net(ndev)->ns.count))
+ if (!check_net(dev_net(ndev)))
dev_set_uevent_suppress(dev, 1);
kobject_get(&dev->kobj);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 8ec9d83475bf..b0e0f22d7b21 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
#include <linux/sched/task.h>
#include <linux/uidgid.h>
#include <linux/proc_fs.h>
+#include <linux/nstree.h>
#include <net/aligned_data.h>
#include <net/sock.h>
@@ -314,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
int id;
- if (refcount_read(&net->ns.count) == 0)
+ if (!check_net(net))
return NETNSA_NSID_NOT_ASSIGNED;
spin_lock(&net->nsid_lock);
@@ -397,10 +398,15 @@ static __net_init void preinit_net_sysctl(struct net *net)
}
/* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
{
+ int ret;
+
+ ret = ns_common_init(net);
+ if (ret)
+ return ret;
+
refcount_set(&net->passive, 1);
- refcount_set(&net->ns.count, 1);
ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");
@@ -420,6 +426,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
INIT_LIST_HEAD(&net->ptype_all);
INIT_LIST_HEAD(&net->ptype_specific);
preinit_net_sysctl(net);
+ return 0;
}
/*
@@ -432,7 +439,7 @@ static __net_init int setup_net(struct net *net)
LIST_HEAD(net_exit_list);
int error = 0;
- net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie);
+ net->net_cookie = ns_tree_gen_id(&net->ns);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
@@ -442,6 +449,7 @@ static __net_init int setup_net(struct net *net)
down_write(&net_rwsem);
list_add_tail_rcu(&net->list, &net_namespace_list);
up_write(&net_rwsem);
+ ns_tree_add_raw(net);
out:
return error;
@@ -559,7 +567,9 @@ struct net *copy_net_ns(u64 flags,
goto dec_ucounts;
}
- preinit_net(net, user_ns);
+ rv = preinit_net(net, user_ns);
+ if (rv < 0)
+ goto dec_ucounts;
net->ucounts = ucounts;
get_user_ns(user_ns);
@@ -573,6 +583,7 @@ struct net *copy_net_ns(u64 flags,
if (rv < 0) {
put_userns:
+ ns_common_free(net);
#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
#endif
@@ -659,8 +670,10 @@ static void cleanup_net(struct work_struct *work)
/* Don't let anyone else find us. */
down_write(&net_rwsem);
- llist_for_each_entry(net, net_kill_list, cleanup_list)
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
+ ns_tree_remove(net);
list_del_rcu(&net->list);
+ }
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
* to a net from net_kill_list (see peernet2id_alloc()).
@@ -693,6 +706,7 @@ static void cleanup_net(struct work_struct *work)
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
+ ns_common_free(net);
dec_net_namespaces(net->ucounts);
#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
@@ -812,31 +826,12 @@ static void net_ns_net_debugfs(struct net *net)
static __net_init int net_ns_net_init(struct net *net)
{
-#ifdef CONFIG_NET_NS
- net->ns.ops = &netns_operations;
-#endif
- net->ns.inum = PROC_NET_INIT_INO;
- if (net != &init_net) {
- int ret = ns_alloc_inum(&net->ns);
- if (ret)
- return ret;
- }
net_ns_net_debugfs(net);
return 0;
}
-static __net_exit void net_ns_net_exit(struct net *net)
-{
- /*
- * Initial network namespace doesn't exit so we don't need any
- * special checks here.
- */
- ns_free_inum(&net->ns);
-}
-
static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
- .exit = net_ns_net_exit,
};
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
@@ -1282,7 +1277,12 @@ void __init net_ns_init(void)
#ifdef CONFIG_KEYS
init_net.key_domain = &init_net_key_domain;
#endif
- preinit_net(&init_net, &init_user_ns);
+ /*
+ * This currently cannot fail as the initial network namespace
+ * has a static inode number.
+ */
+ if (preinit_net(&init_net, &init_user_ns))
+ panic("Could not preinitialize the initial network namespace");
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net))
@@ -1517,11 +1517,6 @@ static struct ns_common *netns_get(struct task_struct *task)
return net ? &net->ns : NULL;
}
-static inline struct net *to_net_ns(struct ns_common *ns)
-{
- return container_of(ns, struct net, ns);
-}
-
static void netns_put(struct ns_common *ns)
{
put_net(to_net_ns(ns));
@@ -1548,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns)
const struct proc_ns_operations netns_operations = {
.name = "net",
- .type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 875ff923a8ed..56a117560c0c 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -329,13 +329,13 @@ restart:
TCPF_NEW_SYN_RECV))
continue;
- if (refcount_read(&sock_net(sk)->ns.count))
+ if (check_net(sock_net(sk)))
continue;
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
continue;
- if (refcount_read(&sock_net(sk)->ns.count)) {
+ if (check_net(sock_net(sk))) {
sock_gen_put(sk);
goto restart;
}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 03c068ea27b6..b67f94c60f9f 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -912,7 +912,7 @@ static void tcp_metrics_flush_all(struct net *net)
spin_lock_bh(&tcp_metrics_lock);
for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
match = net ? net_eq(tm_net(tm), net) :
- !refcount_read(&tm_net(tm)->ns.count);
+ !check_net(tm_net(tm));
if (match) {
rcu_assign_pointer(*pp, tm->tcpm_next);
kfree_rcu(tm, rcu_head);
diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h
index 34127653fd00..33c9b578b3b2 100644
--- a/tools/include/uapi/linux/nsfs.h
+++ b/tools/include/uapi/linux/nsfs.h
@@ -16,8 +16,6 @@
#define NS_GET_NSTYPE _IO(NSIO, 0x3)
/* Get owner UID (in the caller's user namespace) for a user namespace */
#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
-/* Get the id for a mount namespace */
-#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64)
/* Translate pid from target pid namespace into the caller's pid namespace. */
#define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int)
/* Return thread-group leader id of pid in the callers pid namespace. */
@@ -42,4 +40,19 @@ struct mnt_ns_info {
/* Get previous namespace. */
#define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info)
+/* Retrieve namespace identifiers. */
+#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64)
+#define NS_GET_ID _IOR(NSIO, 13, __u64)
+
+enum init_ns_ino {
+ IPC_NS_INIT_INO = 0xEFFFFFFFU,
+ UTS_NS_INIT_INO = 0xEFFFFFFEU,
+ USER_NS_INIT_INO = 0xEFFFFFFDU,
+ PID_NS_INIT_INO = 0xEFFFFFFCU,
+ CGROUP_NS_INIT_INO = 0xEFFFFFFBU,
+ TIME_NS_INIT_INO = 0xEFFFFFFAU,
+ NET_NS_INIT_INO = 0xEFFFFFF9U,
+ MNT_NS_INIT_INO = 0xEFFFFFF8U,
+};
+
#endif /* __LINUX_NSFS_H */
diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore
new file mode 100644
index 000000000000..ccfb40837a73
--- /dev/null
+++ b/tools/testing/selftests/namespaces/.gitignore
@@ -0,0 +1,3 @@
+nsid_test
+file_handle_test
+init_ino_test
diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile
new file mode 100644
index 000000000000..5fe4b3dc07d3
--- /dev/null
+++ b/tools/testing/selftests/namespaces/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+
+TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/namespaces/config b/tools/testing/selftests/namespaces/config
new file mode 100644
index 000000000000..d09836260262
--- /dev/null
+++ b/tools/testing/selftests/namespaces/config
@@ -0,0 +1,7 @@
+CONFIG_UTS_NS=y
+CONFIG_TIME_NS=y
+CONFIG_IPC_NS=y
+CONFIG_USER_NS=y
+CONFIG_PID_NS=y
+CONFIG_NET_NS=y
+CONFIG_CGROUPS=y
diff --git a/tools/testing/selftests/namespaces/file_handle_test.c b/tools/testing/selftests/namespaces/file_handle_test.c
new file mode 100644
index 000000000000..f1bc5773f552
--- /dev/null
+++ b/tools/testing/selftests/namespaces/file_handle_test.c
@@ -0,0 +1,1429 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/unistd.h>
+#include "../kselftest_harness.h"
+
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
+#endif
+
+TEST(nsfs_net_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open a namespace file descriptor */
+ ns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT as unprivileged user */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ if (fd < 0 && errno == EPERM) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "Permission denied for unprivileged user (expected)");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_uts_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open UTS namespace file descriptor */
+ ns_fd = open("/proc/self/ns/uts", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_ipc_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open IPC namespace file descriptor */
+ ns_fd = open("/proc/self/ns/ipc", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_pid_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open PID namespace file descriptor */
+ ns_fd = open("/proc/self/ns/pid", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_mnt_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open mount namespace file descriptor */
+ ns_fd = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_user_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open user namespace file descriptor */
+ ns_fd = open("/proc/self/ns/user", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_cgroup_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open cgroup namespace file descriptor */
+ ns_fd = open("/proc/self/ns/cgroup", O_RDONLY);
+ if (ns_fd < 0) {
+ SKIP(free(handle); return, "cgroup namespace not available");
+ }
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_time_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open time namespace file descriptor */
+ ns_fd = open("/proc/self/ns/time", O_RDONLY);
+ if (ns_fd < 0) {
+ SKIP(free(handle); return, "time namespace not available");
+ }
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_user_net_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current network namespace */
+ ns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create network namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's network namespace handle from new user+net namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new network namespace");
+ }
+
+ /* Should fail with permission denied since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_uts_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current UTS namespace */
+ ns_fd = open("/proc/self/ns/uts", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new UTS namespace */
+ ret = unshare(CLONE_NEWUTS);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create UTS namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's UTS namespace handle from new user+uts namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new UTS namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_ipc_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current IPC namespace */
+ ns_fd = open("/proc/self/ns/ipc", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new IPC namespace */
+ ret = unshare(CLONE_NEWIPC);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create IPC namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's IPC namespace handle from new user+ipc namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new IPC namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_mnt_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current mount namespace */
+ ns_fd = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new mount namespace */
+ ret = unshare(CLONE_NEWNS);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create mount namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's mount namespace handle from new user+mnt namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new mount namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_cgroup_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current cgroup namespace */
+ ns_fd = open("/proc/self/ns/cgroup", O_RDONLY);
+ if (ns_fd < 0) {
+ SKIP(free(handle); close(pipefd[0]); close(pipefd[1]);
+ return, "cgroup namespace not available");
+ }
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new cgroup namespace */
+ ret = unshare(CLONE_NEWCGROUP);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create cgroup namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's cgroup namespace handle from new user+cgroup namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new cgroup namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_pid_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current PID namespace */
+ ns_fd = open("/proc/self/ns/pid", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new PID namespace - requires fork to take effect */
+ ret = unshare(CLONE_NEWPID);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create PID namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Fork again for PID namespace to take effect */
+ pid_t child_pid = fork();
+ if (child_pid < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to fork in PID namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ if (child_pid == 0) {
+ /* Grandchild in new PID namespace */
+ /* Try to open parent's PID namespace handle from new user+pid namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S",
+ 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Wait for grandchild */
+ waitpid(child_pid, NULL, 0);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new PID namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_time_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current time namespace */
+ ns_fd = open("/proc/self/ns/time", O_RDONLY);
+ if (ns_fd < 0) {
+ SKIP(free(handle); close(pipefd[0]); close(pipefd[1]);
+ return, "time namespace not available");
+ }
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new time namespace - requires fork to take effect */
+ ret = unshare(CLONE_NEWTIME);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create time namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Fork again for time namespace to take effect */
+ pid_t child_pid = fork();
+ if (child_pid < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to fork in time namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ if (child_pid == 0) {
+ /* Grandchild in new time namespace */
+ /* Try to open parent's time namespace handle from new user+time namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S",
+ 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Wait for grandchild */
+ waitpid(child_pid, NULL, 0);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new time namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_open_flags)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open a namespace file descriptor */
+ ns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Test invalid flags that should fail */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_WRONLY);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EPERM);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDWR);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EPERM);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TRUNC);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EPERM);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECT);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EINVAL);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TMPFILE);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EINVAL);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECTORY);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, ENOTDIR);
+
+ close(ns_fd);
+ free(handle);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/init_ino_test.c b/tools/testing/selftests/namespaces/init_ino_test.c
new file mode 100644
index 000000000000..5b6993c3740b
--- /dev/null
+++ b/tools/testing/selftests/namespaces/init_ino_test.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2025 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <linux/nsfs.h>
+
+#include "../kselftest_harness.h"
+
+struct ns_info {
+ const char *name;
+ const char *proc_path;
+ unsigned int expected_ino;
+};
+
+static struct ns_info namespaces[] = {
+ { "ipc", "/proc/1/ns/ipc", IPC_NS_INIT_INO },
+ { "uts", "/proc/1/ns/uts", UTS_NS_INIT_INO },
+ { "user", "/proc/1/ns/user", USER_NS_INIT_INO },
+ { "pid", "/proc/1/ns/pid", PID_NS_INIT_INO },
+ { "cgroup", "/proc/1/ns/cgroup", CGROUP_NS_INIT_INO },
+ { "time", "/proc/1/ns/time", TIME_NS_INIT_INO },
+ { "net", "/proc/1/ns/net", NET_NS_INIT_INO },
+ { "mnt", "/proc/1/ns/mnt", MNT_NS_INIT_INO },
+};
+
+TEST(init_namespace_inodes)
+{
+ struct stat st;
+
+ for (int i = 0; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) {
+ int ret = stat(namespaces[i].proc_path, &st);
+
+ /* Some namespaces might not be available (e.g., time namespace on older kernels) */
+ if (ret < 0) {
+ if (errno == ENOENT) {
+ ksft_test_result_skip("%s namespace not available\n",
+ namespaces[i].name);
+ continue;
+ }
+ ASSERT_GE(ret, 0)
+ TH_LOG("Failed to stat %s: %s",
+ namespaces[i].proc_path, strerror(errno));
+ }
+
+ ASSERT_EQ(st.st_ino, namespaces[i].expected_ino)
+ TH_LOG("Namespace %s has inode 0x%lx, expected 0x%x",
+ namespaces[i].name, st.st_ino, namespaces[i].expected_ino);
+
+ ksft_print_msg("Namespace %s: inode 0x%lx matches expected 0x%x\n",
+ namespaces[i].name, st.st_ino, namespaces[i].expected_ino);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c
new file mode 100644
index 000000000000..e28accd74a57
--- /dev/null
+++ b/tools/testing/selftests/namespaces/nsid_test.c
@@ -0,0 +1,986 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <assert.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <poll.h>
+#include <sys/epoll.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+
+TEST(nsid_mntns_basic)
+{
+ __u64 mnt_ns_id = 0;
+ int fd_mntns;
+ int ret;
+
+ /* Open the current mount namespace */
+ fd_mntns = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(fd_mntns, 0);
+
+ /* Get the mount namespace ID */
+ ret = ioctl(fd_mntns, NS_GET_MNTNS_ID, &mnt_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(mnt_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 mnt_ns_id2 = 0;
+ ret = ioctl(fd_mntns, NS_GET_ID, &mnt_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(mnt_ns_id, mnt_ns_id2);
+
+ close(fd_mntns);
+}
+
+TEST(nsid_mntns_separate)
+{
+ __u64 parent_mnt_ns_id = 0;
+ __u64 child_mnt_ns_id = 0;
+ int fd_parent_mntns, fd_child_mntns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's mount namespace ID */
+ fd_parent_mntns = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(fd_parent_mntns, 0);
+ ret = ioctl(fd_parent_mntns, NS_GET_ID, &parent_mnt_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_mnt_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new mount namespace */
+ ret = unshare(CLONE_NEWNS);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_mntns);
+ SKIP(return, "No permission to create mount namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's mount namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
+ fd_child_mntns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_mntns, 0);
+
+ /* Get child's mount namespace ID */
+ ret = ioctl(fd_child_mntns, NS_GET_ID, &child_mnt_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_mnt_ns_id, 0);
+
+ /* Parent and child should have different mount namespace IDs */
+ ASSERT_NE(parent_mnt_ns_id, child_mnt_ns_id);
+
+ close(fd_parent_mntns);
+ close(fd_child_mntns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_cgroupns_basic)
+{
+ __u64 cgroup_ns_id = 0;
+ int fd_cgroupns;
+ int ret;
+
+ /* Open the current cgroup namespace */
+ fd_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY);
+ ASSERT_GE(fd_cgroupns, 0);
+
+ /* Get the cgroup namespace ID */
+ ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(cgroup_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 cgroup_ns_id2 = 0;
+ ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(cgroup_ns_id, cgroup_ns_id2);
+
+ close(fd_cgroupns);
+}
+
+TEST(nsid_cgroupns_separate)
+{
+ __u64 parent_cgroup_ns_id = 0;
+ __u64 child_cgroup_ns_id = 0;
+ int fd_parent_cgroupns, fd_child_cgroupns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's cgroup namespace ID */
+ fd_parent_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY);
+ ASSERT_GE(fd_parent_cgroupns, 0);
+ ret = ioctl(fd_parent_cgroupns, NS_GET_ID, &parent_cgroup_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_cgroup_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new cgroup namespace */
+ ret = unshare(CLONE_NEWCGROUP);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_cgroupns);
+ SKIP(return, "No permission to create cgroup namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's cgroup namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/cgroup", pid);
+ fd_child_cgroupns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_cgroupns, 0);
+
+ /* Get child's cgroup namespace ID */
+ ret = ioctl(fd_child_cgroupns, NS_GET_ID, &child_cgroup_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_cgroup_ns_id, 0);
+
+ /* Parent and child should have different cgroup namespace IDs */
+ ASSERT_NE(parent_cgroup_ns_id, child_cgroup_ns_id);
+
+ close(fd_parent_cgroupns);
+ close(fd_child_cgroupns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_ipcns_basic)
+{
+ __u64 ipc_ns_id = 0;
+ int fd_ipcns;
+ int ret;
+
+ /* Open the current IPC namespace */
+ fd_ipcns = open("/proc/self/ns/ipc", O_RDONLY);
+ ASSERT_GE(fd_ipcns, 0);
+
+ /* Get the IPC namespace ID */
+ ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(ipc_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 ipc_ns_id2 = 0;
+ ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(ipc_ns_id, ipc_ns_id2);
+
+ close(fd_ipcns);
+}
+
+TEST(nsid_ipcns_separate)
+{
+ __u64 parent_ipc_ns_id = 0;
+ __u64 child_ipc_ns_id = 0;
+ int fd_parent_ipcns, fd_child_ipcns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's IPC namespace ID */
+ fd_parent_ipcns = open("/proc/self/ns/ipc", O_RDONLY);
+ ASSERT_GE(fd_parent_ipcns, 0);
+ ret = ioctl(fd_parent_ipcns, NS_GET_ID, &parent_ipc_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_ipc_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new IPC namespace */
+ ret = unshare(CLONE_NEWIPC);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_ipcns);
+ SKIP(return, "No permission to create IPC namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's IPC namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/ipc", pid);
+ fd_child_ipcns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_ipcns, 0);
+
+ /* Get child's IPC namespace ID */
+ ret = ioctl(fd_child_ipcns, NS_GET_ID, &child_ipc_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_ipc_ns_id, 0);
+
+ /* Parent and child should have different IPC namespace IDs */
+ ASSERT_NE(parent_ipc_ns_id, child_ipc_ns_id);
+
+ close(fd_parent_ipcns);
+ close(fd_child_ipcns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_utsns_basic)
+{
+ __u64 uts_ns_id = 0;
+ int fd_utsns;
+ int ret;
+
+ /* Open the current UTS namespace */
+ fd_utsns = open("/proc/self/ns/uts", O_RDONLY);
+ ASSERT_GE(fd_utsns, 0);
+
+ /* Get the UTS namespace ID */
+ ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(uts_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 uts_ns_id2 = 0;
+ ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(uts_ns_id, uts_ns_id2);
+
+ close(fd_utsns);
+}
+
+TEST(nsid_utsns_separate)
+{
+ __u64 parent_uts_ns_id = 0;
+ __u64 child_uts_ns_id = 0;
+ int fd_parent_utsns, fd_child_utsns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's UTS namespace ID */
+ fd_parent_utsns = open("/proc/self/ns/uts", O_RDONLY);
+ ASSERT_GE(fd_parent_utsns, 0);
+ ret = ioctl(fd_parent_utsns, NS_GET_ID, &parent_uts_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_uts_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new UTS namespace */
+ ret = unshare(CLONE_NEWUTS);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_utsns);
+ SKIP(return, "No permission to create UTS namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's UTS namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid);
+ fd_child_utsns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_utsns, 0);
+
+ /* Get child's UTS namespace ID */
+ ret = ioctl(fd_child_utsns, NS_GET_ID, &child_uts_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_uts_ns_id, 0);
+
+ /* Parent and child should have different UTS namespace IDs */
+ ASSERT_NE(parent_uts_ns_id, child_uts_ns_id);
+
+ close(fd_parent_utsns);
+ close(fd_child_utsns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_userns_basic)
+{
+ __u64 user_ns_id = 0;
+ int fd_userns;
+ int ret;
+
+ /* Open the current user namespace */
+ fd_userns = open("/proc/self/ns/user", O_RDONLY);
+ ASSERT_GE(fd_userns, 0);
+
+ /* Get the user namespace ID */
+ ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(user_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 user_ns_id2 = 0;
+ ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(user_ns_id, user_ns_id2);
+
+ close(fd_userns);
+}
+
+TEST(nsid_userns_separate)
+{
+ __u64 parent_user_ns_id = 0;
+ __u64 child_user_ns_id = 0;
+ int fd_parent_userns, fd_child_userns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's user namespace ID */
+ fd_parent_userns = open("/proc/self/ns/user", O_RDONLY);
+ ASSERT_GE(fd_parent_userns, 0);
+ ret = ioctl(fd_parent_userns, NS_GET_ID, &parent_user_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_user_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_userns);
+ SKIP(return, "No permission to create user namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's user namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
+ fd_child_userns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_userns, 0);
+
+ /* Get child's user namespace ID */
+ ret = ioctl(fd_child_userns, NS_GET_ID, &child_user_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_user_ns_id, 0);
+
+ /* Parent and child should have different user namespace IDs */
+ ASSERT_NE(parent_user_ns_id, child_user_ns_id);
+
+ close(fd_parent_userns);
+ close(fd_child_userns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_timens_basic)
+{
+ __u64 time_ns_id = 0;
+ int fd_timens;
+ int ret;
+
+ /* Open the current time namespace */
+ fd_timens = open("/proc/self/ns/time", O_RDONLY);
+ if (fd_timens < 0) {
+ SKIP(return, "Time namespaces not supported");
+ }
+
+ /* Get the time namespace ID */
+ ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(time_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 time_ns_id2 = 0;
+ ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(time_ns_id, time_ns_id2);
+
+ close(fd_timens);
+}
+
+TEST(nsid_timens_separate)
+{
+ __u64 parent_time_ns_id = 0;
+ __u64 child_time_ns_id = 0;
+ int fd_parent_timens, fd_child_timens;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Open the current time namespace */
+ fd_parent_timens = open("/proc/self/ns/time", O_RDONLY);
+ if (fd_parent_timens < 0) {
+ SKIP(return, "Time namespaces not supported");
+ }
+
+ /* Get parent's time namespace ID */
+ ret = ioctl(fd_parent_timens, NS_GET_ID, &parent_time_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_time_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new time namespace */
+ ret = unshare(CLONE_NEWTIME);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES || errno == EINVAL) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Fork a grandchild to actually enter the new namespace */
+ pid_t grandchild = fork();
+ if (grandchild == 0) {
+ /* Grandchild is in the new namespace */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+ pause();
+ _exit(0);
+ } else if (grandchild > 0) {
+ /* Child writes grandchild PID and waits */
+ write(pipefd[1], "Y", 1);
+ write(pipefd[1], &grandchild, sizeof(grandchild));
+ close(pipefd[1]);
+ pause(); /* Keep the parent alive to maintain the grandchild */
+ _exit(0);
+ } else {
+ _exit(1);
+ }
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_timens);
+ close(pipefd[0]);
+ SKIP(return, "Cannot create time namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ pid_t grandchild_pid;
+ ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid));
+ close(pipefd[0]);
+
+ /* Open grandchild's time namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/time", grandchild_pid);
+ fd_child_timens = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_timens, 0);
+
+ /* Get child's time namespace ID */
+ ret = ioctl(fd_child_timens, NS_GET_ID, &child_time_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_time_ns_id, 0);
+
+ /* Parent and child should have different time namespace IDs */
+ ASSERT_NE(parent_time_ns_id, child_time_ns_id);
+
+ close(fd_parent_timens);
+ close(fd_child_timens);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_pidns_basic)
+{
+ __u64 pid_ns_id = 0;
+ int fd_pidns;
+ int ret;
+
+ /* Open the current PID namespace */
+ fd_pidns = open("/proc/self/ns/pid", O_RDONLY);
+ ASSERT_GE(fd_pidns, 0);
+
+ /* Get the PID namespace ID */
+ ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(pid_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 pid_ns_id2 = 0;
+ ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(pid_ns_id, pid_ns_id2);
+
+ close(fd_pidns);
+}
+
+TEST(nsid_pidns_separate)
+{
+ __u64 parent_pid_ns_id = 0;
+ __u64 child_pid_ns_id = 0;
+ int fd_parent_pidns, fd_child_pidns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's PID namespace ID */
+ fd_parent_pidns = open("/proc/self/ns/pid", O_RDONLY);
+ ASSERT_GE(fd_parent_pidns, 0);
+ ret = ioctl(fd_parent_pidns, NS_GET_ID, &parent_pid_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_pid_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new PID namespace */
+ ret = unshare(CLONE_NEWPID);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Fork a grandchild to actually enter the new namespace */
+ pid_t grandchild = fork();
+ if (grandchild == 0) {
+ /* Grandchild is in the new namespace */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+ pause();
+ _exit(0);
+ } else if (grandchild > 0) {
+ /* Child writes grandchild PID and waits */
+ write(pipefd[1], "Y", 1);
+ write(pipefd[1], &grandchild, sizeof(grandchild));
+ close(pipefd[1]);
+ pause(); /* Keep the parent alive to maintain the grandchild */
+ _exit(0);
+ } else {
+ _exit(1);
+ }
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_pidns);
+ close(pipefd[0]);
+ SKIP(return, "No permission to create PID namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ pid_t grandchild_pid;
+ ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid));
+ close(pipefd[0]);
+
+ /* Open grandchild's PID namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/pid", grandchild_pid);
+ fd_child_pidns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_pidns, 0);
+
+ /* Get child's PID namespace ID */
+ ret = ioctl(fd_child_pidns, NS_GET_ID, &child_pid_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_pid_ns_id, 0);
+
+ /* Parent and child should have different PID namespace IDs */
+ ASSERT_NE(parent_pid_ns_id, child_pid_ns_id);
+
+ close(fd_parent_pidns);
+ close(fd_child_pidns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_netns_basic)
+{
+ __u64 net_ns_id = 0;
+ __u64 netns_cookie = 0;
+ int fd_netns;
+ int sock;
+ socklen_t optlen;
+ int ret;
+
+ /* Open the current network namespace */
+ fd_netns = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(fd_netns, 0);
+
+ /* Get the network namespace ID via ioctl */
+ ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(net_ns_id, 0);
+
+ /* Create a socket to get the SO_NETNS_COOKIE */
+ sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_GE(sock, 0);
+
+ /* Get the network namespace cookie via socket option */
+ optlen = sizeof(netns_cookie);
+ ret = getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(optlen, sizeof(netns_cookie));
+
+ /* The namespace ID and cookie should be identical */
+ ASSERT_EQ(net_ns_id, netns_cookie);
+
+ /* Verify we can get the same ID again */
+ __u64 net_ns_id2 = 0;
+ ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(net_ns_id, net_ns_id2);
+
+ close(sock);
+ close(fd_netns);
+}
+
+TEST(nsid_netns_separate)
+{
+ __u64 parent_net_ns_id = 0;
+ __u64 parent_netns_cookie = 0;
+ __u64 child_net_ns_id = 0;
+ __u64 child_netns_cookie = 0;
+ int fd_parent_netns, fd_child_netns;
+ int parent_sock, child_sock;
+ socklen_t optlen;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's network namespace ID */
+ fd_parent_netns = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(fd_parent_netns, 0);
+ ret = ioctl(fd_parent_netns, NS_GET_ID, &parent_net_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_net_ns_id, 0);
+
+ /* Get parent's network namespace cookie */
+ parent_sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_GE(parent_sock, 0);
+ optlen = sizeof(parent_netns_cookie);
+ ret = getsockopt(parent_sock, SOL_SOCKET, SO_NETNS_COOKIE, &parent_netns_cookie, &optlen);
+ ASSERT_EQ(ret, 0);
+
+ /* Verify parent's ID and cookie match */
+ ASSERT_EQ(parent_net_ns_id, parent_netns_cookie);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_netns);
+ close(parent_sock);
+ SKIP(return, "No permission to create network namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's network namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/net", pid);
+ fd_child_netns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_netns, 0);
+
+ /* Get child's network namespace ID */
+ ret = ioctl(fd_child_netns, NS_GET_ID, &child_net_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_net_ns_id, 0);
+
+ /* Create socket in child's namespace to get cookie */
+ ret = setns(fd_child_netns, CLONE_NEWNET);
+ if (ret == 0) {
+ child_sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_GE(child_sock, 0);
+
+ optlen = sizeof(child_netns_cookie);
+ ret = getsockopt(child_sock, SOL_SOCKET, SO_NETNS_COOKIE, &child_netns_cookie, &optlen);
+ ASSERT_EQ(ret, 0);
+
+ /* Verify child's ID and cookie match */
+ ASSERT_EQ(child_net_ns_id, child_netns_cookie);
+
+ close(child_sock);
+
+ /* Return to parent namespace */
+ setns(fd_parent_netns, CLONE_NEWNET);
+ }
+
+ /* Parent and child should have different network namespace IDs */
+ ASSERT_NE(parent_net_ns_id, child_net_ns_id);
+ if (child_netns_cookie != 0) {
+ ASSERT_NE(parent_netns_cookie, child_netns_cookie);
+ }
+
+ close(fd_parent_netns);
+ close(fd_child_netns);
+ close(parent_sock);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST_HARNESS_MAIN