summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm64/tools/syscall_32.tbl1
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl1
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--fs/libfs.c1
-rw-r--r--fs/mount.h3
-rw-r--r--fs/namespace.c10
-rw-r--r--fs/nsfs.c101
-rw-r--r--fs/pidfs.c76
-rw-r--r--include/linux/ns/ns_common_types.h196
-rw-r--r--include/linux/ns/nstree_types.h55
-rw-r--r--include/linux/ns_common.h233
-rw-r--r--include/linux/nsfs.h3
-rw-r--r--include/linux/nsproxy.h9
-rw-r--r--include/linux/nstree.h52
-rw-r--r--include/linux/pid_namespace.h3
-rw-r--r--include/linux/pseudo_fs.h1
-rw-r--r--include/linux/syscalls.h4
-rw-r--r--include/linux/user_namespace.h4
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--include/uapi/linux/nsfs.h58
-rw-r--r--init/version-timestamp.c7
-rw-r--r--ipc/msgutil.c7
-rw-r--r--ipc/namespace.c3
-rw-r--r--kernel/cgroup/cgroup.c11
-rw-r--r--kernel/cgroup/namespace.c2
-rw-r--r--kernel/cred.c6
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/nscommon.c246
-rw-r--r--kernel/nsproxy.c57
-rw-r--r--kernel/nstree.c782
-rw-r--r--kernel/pid.c12
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/time/namespace.c5
-rw-r--r--kernel/user.c7
-rw-r--r--net/core/net_namespace.c2
-rw-r--r--scripts/syscall.tbl1
-rw-r--r--tools/include/uapi/linux/nsfs.h70
-rw-r--r--tools/testing/selftests/filesystems/utils.c2
-rw-r--r--tools/testing/selftests/namespaces/.gitignore9
-rw-r--r--tools/testing/selftests/namespaces/Makefile24
-rw-r--r--tools/testing/selftests/namespaces/cred_change_test.c814
-rw-r--r--tools/testing/selftests/namespaces/listns_efault_test.c530
-rw-r--r--tools/testing/selftests/namespaces/listns_pagination_bug.c138
-rw-r--r--tools/testing/selftests/namespaces/listns_permissions_test.c759
-rw-r--r--tools/testing/selftests/namespaces/listns_test.c679
-rw-r--r--tools/testing/selftests/namespaces/ns_active_ref_test.c2672
-rw-r--r--tools/testing/selftests/namespaces/nsid_test.c107
-rw-r--r--tools/testing/selftests/namespaces/regression_pidfd_setns_test.c113
-rw-r--r--tools/testing/selftests/namespaces/siocgskns_test.c1824
-rw-r--r--tools/testing/selftests/namespaces/stress_test.c626
-rw-r--r--tools/testing/selftests/namespaces/wrappers.h35
65 files changed, 9967 insertions, 420 deletions
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 16dca28ebf17..3fed97478058 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -509,3 +509,4 @@
577 common open_tree_attr sys_open_tree_attr
578 common file_getattr sys_file_getattr
579 common file_setattr sys_file_setattr
+580 common listns sys_listns
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index b07e699aaa3c..fd09afae72a2 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -484,3 +484,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 8d9088bc577d..8cdfe5d4dac9 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -481,3 +481,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index f41d38dfbf13..871a5d67bf41 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -469,3 +469,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 580af574fe73..022fc85d94b3 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -475,3 +475,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index d824ffe9a014..8cedc83c3266 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -408,3 +408,4 @@
467 n32 open_tree_attr sys_open_tree_attr
468 n32 file_getattr sys_file_getattr
469 n32 file_setattr sys_file_setattr
+470 n32 listns sys_listns
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 7a7049c2c307..9b92bddf06b5 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -384,3 +384,4 @@
467 n64 open_tree_attr sys_open_tree_attr
468 n64 file_getattr sys_file_getattr
469 n64 file_setattr sys_file_setattr
+470 n64 listns sys_listns
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index d330274f0601..f810b8a55716 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -457,3 +457,4 @@
467 o32 open_tree_attr sys_open_tree_attr
468 o32 file_getattr sys_file_getattr
469 o32 file_setattr sys_file_setattr
+470 o32 listns sys_listns
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 88a788a7b18d..39bdacaa530b 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -468,3 +468,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index b453e80dfc00..ec4458cdb97b 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -560,3 +560,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 8a6744d658db..5863787ab036 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -472,3 +472,4 @@
467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr sys_file_setattr
+470 common listns sys_listns sys_listns
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 5e9c9eff5539..969c11325ade 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index ebb7d06d1044..39aa26b6a50b 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -515,3 +515,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4877e16da69a..e979a3eac7a3 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -475,3 +475,4 @@
467 i386 open_tree_attr sys_open_tree_attr
468 i386 file_getattr sys_file_getattr
469 i386 file_setattr sys_file_setattr
+470 i386 listns sys_listns
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ced2a1deecd7..8a4ac4841be6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -394,6 +394,7 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 374e4cb788d8..438a3b170402 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/fs/libfs.c b/fs/libfs.c
index 96e3d7fc7fc6..1661dcb7d983 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -680,6 +680,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_export_op = ctx->eops;
s->s_xattr = ctx->xattr;
s->s_time_gran = 1;
+ s->s_d_flags |= ctx->s_d_flags;
root = new_inode(s);
if (!root)
return -ENOMEM;
diff --git a/fs/mount.h b/fs/mount.h
index f13a28752d0b..2d28ef2a3aed 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -27,6 +27,7 @@ struct mnt_namespace {
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
refcount_t passive; /* number references not pinning @mounts */
+ bool is_anon;
} __randomize_layout;
struct mnt_pcp {
@@ -175,7 +176,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry)
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
- return ns->ns.ns_id == 0;
+ return ns->is_anon;
}
static inline bool anon_ns_root(const struct mount *m)
diff --git a/fs/namespace.c b/fs/namespace.c
index a7fd9682bcf9..d7afac807ac3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4090,8 +4090,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
dec_mnt_namespaces(ucounts);
return ERR_PTR(ret);
}
- if (!anon)
- ns_tree_gen_id(&new_ns->ns);
+ ns_tree_gen_id(new_ns);
+
+ new_ns->is_anon = anon;
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
init_waitqueue_head(&new_ns->poll);
@@ -5982,11 +5983,8 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
}
struct mnt_namespace init_mnt_ns = {
- .ns.inum = ns_init_inum(&init_mnt_ns),
- .ns.ops = &mntns_operations,
+ .ns = NS_COMMON_INIT(init_mnt_ns),
.user_ns = &init_user_ns,
- .ns.__ns_ref = REFCOUNT_INIT(1),
- .ns.ns_type = ns_common_type(&init_mnt_ns),
.passive = REFCOUNT_INIT(1),
.mounts = RB_ROOT,
.poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 79b026a36fb6..a80f8d2a4122 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -58,6 +58,8 @@ const struct dentry_operations ns_dentry_operations = {
static void nsfs_evict(struct inode *inode)
{
struct ns_common *ns = inode->i_private;
+
+ __ns_ref_active_put(ns);
clear_inode(inode);
ns->ops->put(ns);
}
@@ -408,6 +410,7 @@ static const struct super_operations nsfs_ops = {
.statfs = simple_statfs,
.evict_inode = nsfs_evict,
.show_path = nsfs_show_path,
+ .drop_inode = inode_just_drop,
};
static int nsfs_init_inode(struct inode *inode, void *data)
@@ -418,6 +421,16 @@ static int nsfs_init_inode(struct inode *inode, void *data)
inode->i_mode |= S_IRUGO;
inode->i_fop = &ns_file_operations;
inode->i_ino = ns->inum;
+
+ /*
+ * Bring the namespace subtree back to life if we have to. This
+ * can happen when e.g., all processes using a network namespace
+ * and all namespace files or namespace file bind-mounts have
+ * died but there are still sockets pinning it. The SIOCGSKNS
+ * ioctl on such a socket will resurrect the relevant namespace
+ * subtree.
+ */
+ __ns_ref_active_get(ns);
return 0;
}
@@ -458,6 +471,45 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return FILEID_NSFS;
}
+bool is_current_namespace(struct ns_common *ns)
+{
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ return current_in_namespace(to_cg_ns(ns));
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ return current_in_namespace(to_ipc_ns(ns));
+#endif
+ case CLONE_NEWNS:
+ return current_in_namespace(to_mnt_ns(ns));
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ return current_in_namespace(to_net_ns(ns));
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ return current_in_namespace(to_pid_ns(ns));
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ return current_in_namespace(to_time_ns(ns));
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ return current_in_namespace(to_user_ns(ns));
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ return current_in_namespace(to_uts_ns(ns));
+#endif
+ default:
+ VFS_WARN_ON_ONCE(true);
+ return false;
+ }
+}
+
static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
int fh_len, int fh_type)
{
@@ -483,18 +535,35 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
return NULL;
}
+ if (!fid->ns_id)
+ return NULL;
+ /* Either both are set or both are unset. */
+ if (!fid->ns_inum != !fid->ns_type)
+ return NULL;
+
scoped_guard(rcu) {
ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
if (!ns)
return NULL;
VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
- VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type);
- if (ns->inum != fid->ns_inum)
+ if (fid->ns_inum && (fid->ns_inum != ns->inum))
+ return NULL;
+ if (fid->ns_type && (fid->ns_type != ns->ns_type))
return NULL;
- if (!__ns_ref_get(ns))
+ /*
+ * This is racy because we're not actually taking an
+ * active reference. IOW, it could happen that the
+ * namespace becomes inactive after this check.
+ * We don't care because nsfs_init_inode() will just
+ * resurrect the relevant namespace tree for us. If it
+ * has been active here we just allow it's resurrection.
+ * We could try to take an active reference here and
+ * then drop it again. But really, why bother.
+ */
+ if (!ns_get_unless_inactive(ns))
return NULL;
}
@@ -590,6 +659,8 @@ static int nsfs_init_fs_context(struct fs_context *fc)
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
+ ctx->s_d_flags |= DCACHE_DONTCACHE;
ctx->ops = &nsfs_ops;
ctx->eops = &nsfs_export_operations;
ctx->dops = &ns_dentry_operations;
@@ -612,3 +683,27 @@ void __init nsfs_init(void)
nsfs_root_path.mnt = nsfs_mnt;
nsfs_root_path.dentry = nsfs_mnt->mnt_root;
}
+
+void nsproxy_ns_active_get(struct nsproxy *ns)
+{
+ ns_ref_active_get(ns->mnt_ns);
+ ns_ref_active_get(ns->uts_ns);
+ ns_ref_active_get(ns->ipc_ns);
+ ns_ref_active_get(ns->pid_ns_for_children);
+ ns_ref_active_get(ns->cgroup_ns);
+ ns_ref_active_get(ns->net_ns);
+ ns_ref_active_get(ns->time_ns);
+ ns_ref_active_get(ns->time_ns_for_children);
+}
+
+void nsproxy_ns_active_put(struct nsproxy *ns)
+{
+ ns_ref_active_put(ns->mnt_ns);
+ ns_ref_active_put(ns->uts_ns);
+ ns_ref_active_put(ns->ipc_ns);
+ ns_ref_active_put(ns->pid_ns_for_children);
+ ns_ref_active_put(ns->cgroup_ns);
+ ns_ref_active_put(ns->net_ns);
+ ns_ref_active_put(ns->time_ns);
+ ns_ref_active_put(ns->time_ns_for_children);
+}
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 0ef5b47d796a..78dee3c201af 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -454,7 +454,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct task_struct *task __free(put_task) = NULL;
struct nsproxy *nsp __free(put_nsproxy) = NULL;
struct ns_common *ns_common = NULL;
- struct pid_namespace *pid_ns;
if (!pidfs_ioctl_valid(cmd))
return -ENOIOCTLCMD;
@@ -496,66 +495,64 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
switch (cmd) {
/* Namespaces that hang of nsproxy. */
case PIDFD_GET_CGROUP_NAMESPACE:
- if (IS_ENABLED(CONFIG_CGROUPS)) {
- get_cgroup_ns(nsp->cgroup_ns);
- ns_common = to_ns_common(nsp->cgroup_ns);
- }
+ if (!ns_ref_get(nsp->cgroup_ns))
+ break;
+ ns_common = to_ns_common(nsp->cgroup_ns);
break;
case PIDFD_GET_IPC_NAMESPACE:
- if (IS_ENABLED(CONFIG_IPC_NS)) {
- get_ipc_ns(nsp->ipc_ns);
- ns_common = to_ns_common(nsp->ipc_ns);
- }
+ if (!ns_ref_get(nsp->ipc_ns))
+ break;
+ ns_common = to_ns_common(nsp->ipc_ns);
break;
case PIDFD_GET_MNT_NAMESPACE:
- get_mnt_ns(nsp->mnt_ns);
+ if (!ns_ref_get(nsp->mnt_ns))
+ break;
ns_common = to_ns_common(nsp->mnt_ns);
break;
case PIDFD_GET_NET_NAMESPACE:
- if (IS_ENABLED(CONFIG_NET_NS)) {
- ns_common = to_ns_common(nsp->net_ns);
- get_net_ns(ns_common);
- }
+ if (!ns_ref_get(nsp->net_ns))
+ break;
+ ns_common = to_ns_common(nsp->net_ns);
break;
case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
- if (IS_ENABLED(CONFIG_PID_NS)) {
- get_pid_ns(nsp->pid_ns_for_children);
- ns_common = to_ns_common(nsp->pid_ns_for_children);
- }
+ if (!ns_ref_get(nsp->pid_ns_for_children))
+ break;
+ ns_common = to_ns_common(nsp->pid_ns_for_children);
break;
case PIDFD_GET_TIME_NAMESPACE:
- if (IS_ENABLED(CONFIG_TIME_NS)) {
- get_time_ns(nsp->time_ns);
- ns_common = to_ns_common(nsp->time_ns);
- }
+ if (!ns_ref_get(nsp->time_ns))
+ break;
+ ns_common = to_ns_common(nsp->time_ns);
break;
case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
- if (IS_ENABLED(CONFIG_TIME_NS)) {
- get_time_ns(nsp->time_ns_for_children);
- ns_common = to_ns_common(nsp->time_ns_for_children);
- }
+ if (!ns_ref_get(nsp->time_ns_for_children))
+ break;
+ ns_common = to_ns_common(nsp->time_ns_for_children);
break;
case PIDFD_GET_UTS_NAMESPACE:
- if (IS_ENABLED(CONFIG_UTS_NS)) {
- get_uts_ns(nsp->uts_ns);
- ns_common = to_ns_common(nsp->uts_ns);
- }
+ if (!ns_ref_get(nsp->uts_ns))
+ break;
+ ns_common = to_ns_common(nsp->uts_ns);
break;
/* Namespaces that don't hang of nsproxy. */
case PIDFD_GET_USER_NAMESPACE:
- if (IS_ENABLED(CONFIG_USER_NS)) {
- rcu_read_lock();
- ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns)));
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ struct user_namespace *user_ns;
+
+ user_ns = task_cred_xxx(task, user_ns);
+ if (!ns_ref_get(user_ns))
+ break;
+ ns_common = to_ns_common(user_ns);
}
break;
case PIDFD_GET_PID_NAMESPACE:
- if (IS_ENABLED(CONFIG_PID_NS)) {
- rcu_read_lock();
+ scoped_guard(rcu) {
+ struct pid_namespace *pid_ns;
+
pid_ns = task_active_pid_ns(task);
- if (pid_ns)
- ns_common = to_ns_common(get_pid_ns(pid_ns));
- rcu_read_unlock();
+ if (!ns_ref_get(pid_ns))
+ break;
+ ns_common = to_ns_common(pid_ns);
}
break;
default:
@@ -1022,6 +1019,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
fc->s_iflags |= SB_I_NOEXEC;
fc->s_iflags |= SB_I_NODEV;
+ ctx->s_d_flags |= DCACHE_DONTCACHE;
ctx->ops = &pidfs_sops;
ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations;
diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h
new file mode 100644
index 000000000000..b332b019b29c
--- /dev/null
+++ b/include/linux/ns/ns_common_types.h
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NS_COMMON_TYPES_H
+#define _LINUX_NS_COMMON_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/ns/nstree_types.h>
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/types.h>
+
+struct cgroup_namespace;
+struct dentry;
+struct ipc_namespace;
+struct mnt_namespace;
+struct net;
+struct pid_namespace;
+struct proc_ns_operations;
+struct time_namespace;
+struct user_namespace;
+struct uts_namespace;
+
+extern struct cgroup_namespace init_cgroup_ns;
+extern struct ipc_namespace init_ipc_ns;
+extern struct mnt_namespace init_mnt_ns;
+extern struct net init_net;
+extern struct pid_namespace init_pid_ns;
+extern struct time_namespace init_time_ns;
+extern struct user_namespace init_user_ns;
+extern struct uts_namespace init_uts_ns;
+
+extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations ipcns_operations;
+extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations netns_operations;
+extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
+extern const struct proc_ns_operations timens_operations;
+extern const struct proc_ns_operations timens_for_children_operations;
+extern const struct proc_ns_operations userns_operations;
+extern const struct proc_ns_operations utsns_operations;
+
+/*
+ * Namespace lifetimes are managed via a two-tier reference counting model:
+ *
+ * (1) __ns_ref (refcount_t): Main reference count tracking memory
+ * lifetime. Controls when the namespace structure itself is freed.
+ * It also pins the namespace on the namespace trees whereas (2)
+ * only regulates their visibility to userspace.
+ *
+ * (2) __ns_ref_active (atomic_t): Reference count tracking active users.
+ * Controls visibility of the namespace in the namespace trees.
+ * Any live task that uses the namespace (via nsproxy or cred) holds
+ * an active reference. Any open file descriptor or bind-mount of
+ * the namespace holds an active reference. Once all tasks have
+ * called exited their namespaces and all file descriptors and
+ * bind-mounts have been released the active reference count drops
+ * to zero and the namespace becomes inactive. IOW, the namespace
+ * cannot be listed or opened via file handles anymore.
+ *
+ * Note that it is valid to transition from active to inactive and
+ * back from inactive to active e.g., when resurrecting an inactive
+ * namespace tree via the SIOCGSKNS ioctl().
+ *
+ * Relationship and lifecycle states:
+ *
+ * - Active (__ns_ref_active > 0):
+ * Namespace is actively used and visible to userspace. The namespace
+ * can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file
+ * handles, or discovered via listns().
+ *
+ * - Inactive (__ns_ref_active == 0, __ns_ref > 0):
+ * No tasks are actively using the namespace and it isn't pinned by
+ * any bind-mounts or open file descriptors anymore. But the namespace
+ * is still kept alive by internal references. For example, the user
+ * namespace could be pinned by an open file through file->f_cred
+ * references when one of the now defunct tasks had opened a file and
+ * handed the file descriptor off to another process via a UNIX
+ * sockets. Such references keep the namespace structure alive through
+ * __ns_ref but will not hold an active reference.
+ *
+ * - Destroyed (__ns_ref == 0):
+ * No references remain. The namespace is removed from the tree and freed.
+ *
+ * State transitions:
+ *
+ * Active -> Inactive:
+ * When the last task using the namespace exits it drops its active
+ * references to all namespaces. However, user and pid namespaces
+ * remain accessible until the task has been reaped.
+ *
+ * Inactive -> Active:
+ * An inactive namespace tree might be resurrected due to e.g., the
+ * SIOCGSKNS ioctl() on a socket.
+ *
+ * Inactive -> Destroyed:
+ * When __ns_ref drops to zero the namespace is removed from the
+ * namespaces trees and the memory is freed (after RCU grace period).
+ *
+ * Initial namespaces:
+ * Boot-time namespaces (init_net, init_pid_ns, etc.) start with
+ * __ns_ref_active = 1 and remain active forever.
+ *
+ * @ns_type: type of namespace (e.g., CLONE_NEWNET)
+ * @stashed: cached dentry to be used by the vfs
+ * @ops: namespace operations
+ * @inum: namespace inode number (quickly recycled for non-initial namespaces)
+ * @__ns_ref: main reference count (do not use directly)
+ * @ns_tree: namespace tree nodes and active reference count
+ */
+struct ns_common {
+ u32 ns_type;
+ struct dentry *stashed;
+ const struct proc_ns_operations *ops;
+ unsigned int inum;
+ refcount_t __ns_ref; /* do not use directly */
+ union {
+ struct ns_tree;
+ struct rcu_head ns_rcu;
+ };
+};
+
+#define to_ns_common(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &(__ns)->ns, \
+ const struct cgroup_namespace *: &(__ns)->ns, \
+ struct ipc_namespace *: &(__ns)->ns, \
+ const struct ipc_namespace *: &(__ns)->ns, \
+ struct mnt_namespace *: &(__ns)->ns, \
+ const struct mnt_namespace *: &(__ns)->ns, \
+ struct net *: &(__ns)->ns, \
+ const struct net *: &(__ns)->ns, \
+ struct pid_namespace *: &(__ns)->ns, \
+ const struct pid_namespace *: &(__ns)->ns, \
+ struct time_namespace *: &(__ns)->ns, \
+ const struct time_namespace *: &(__ns)->ns, \
+ struct user_namespace *: &(__ns)->ns, \
+ const struct user_namespace *: &(__ns)->ns, \
+ struct uts_namespace *: &(__ns)->ns, \
+ const struct uts_namespace *: &(__ns)->ns)
+
+#define ns_init_inum(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
+ struct ipc_namespace *: IPC_NS_INIT_INO, \
+ struct mnt_namespace *: MNT_NS_INIT_INO, \
+ struct net *: NET_NS_INIT_INO, \
+ struct pid_namespace *: PID_NS_INIT_INO, \
+ struct time_namespace *: TIME_NS_INIT_INO, \
+ struct user_namespace *: USER_NS_INIT_INO, \
+ struct uts_namespace *: UTS_NS_INIT_INO)
+
+#define ns_init_ns(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &init_cgroup_ns, \
+ struct ipc_namespace *: &init_ipc_ns, \
+ struct mnt_namespace *: &init_mnt_ns, \
+ struct net *: &init_net, \
+ struct pid_namespace *: &init_pid_ns, \
+ struct time_namespace *: &init_time_ns, \
+ struct user_namespace *: &init_user_ns, \
+ struct uts_namespace *: &init_uts_ns)
+
+#define ns_init_id(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_ID, \
+ struct ipc_namespace *: IPC_NS_INIT_ID, \
+ struct mnt_namespace *: MNT_NS_INIT_ID, \
+ struct net *: NET_NS_INIT_ID, \
+ struct pid_namespace *: PID_NS_INIT_ID, \
+ struct time_namespace *: TIME_NS_INIT_ID, \
+ struct user_namespace *: USER_NS_INIT_ID, \
+ struct uts_namespace *: UTS_NS_INIT_ID)
+
+#define to_ns_operations(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
+ struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
+ struct mnt_namespace *: &mntns_operations, \
+ struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
+ struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
+ struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
+ struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
+ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
+
+#define ns_common_type(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CLONE_NEWCGROUP, \
+ struct ipc_namespace *: CLONE_NEWIPC, \
+ struct mnt_namespace *: CLONE_NEWNS, \
+ struct net *: CLONE_NEWNET, \
+ struct pid_namespace *: CLONE_NEWPID, \
+ struct time_namespace *: CLONE_NEWTIME, \
+ struct user_namespace *: CLONE_NEWUSER, \
+ struct uts_namespace *: CLONE_NEWUTS)
+
+#endif /* _LINUX_NS_COMMON_TYPES_H */
diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h
new file mode 100644
index 000000000000..2fb28ee31efb
--- /dev/null
+++ b/include/linux/ns/nstree_types.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+#ifndef _LINUX_NSTREE_TYPES_H
+#define _LINUX_NSTREE_TYPES_H
+
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+/**
+ * struct ns_tree_root - Root of a namespace tree
+ * @ns_rb: Red-black tree root for efficient lookups
+ * @ns_list_head: List head for sequential iteration
+ *
+ * Each namespace tree maintains both an rbtree (for O(log n) lookups)
+ * and a list (for efficient sequential iteration). The list is kept in
+ * the same sorted order as the rbtree.
+ */
+struct ns_tree_root {
+ struct rb_root ns_rb;
+ struct list_head ns_list_head;
+};
+
+/**
+ * struct ns_tree_node - Node in a namespace tree
+ * @ns_node: Red-black tree node
+ * @ns_list_entry: List entry for sequential iteration
+ *
+ * Represents a namespace's position in a tree. Each namespace has
+ * multiple tree nodes for different trees (unified, per-type, owner).
+ */
+struct ns_tree_node {
+ struct rb_node ns_node;
+ struct list_head ns_list_entry;
+};
+
+/**
+ * struct ns_tree - Namespace tree nodes and active reference count
+ * @ns_id: Unique namespace identifier
+ * @__ns_ref_active: Active reference count (do not use directly)
+ * @ns_unified_node: Node in the global namespace tree
+ * @ns_tree_node: Node in the per-type namespace tree
+ * @ns_owner_node: Node in the owner namespace's tree of owned namespaces
+ * @ns_owner_root: Root of the tree of namespaces owned by this namespace
+ * (only used when this namespace is an owner)
+ */
+struct ns_tree {
+ u64 ns_id;
+ atomic_t __ns_ref_active;
+ struct ns_tree_node ns_unified_node;
+ struct ns_tree_node ns_tree_node;
+ struct ns_tree_node ns_owner_node;
+ struct ns_tree_root ns_owner_root;
+};
+
+#endif /* _LINUX_NSTREE_TYPES_H */
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index f5b68b8abb54..825f5865bfc5 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -2,122 +2,44 @@
#ifndef _LINUX_NS_COMMON_H
#define _LINUX_NS_COMMON_H
+#include <linux/ns/ns_common_types.h>
#include <linux/refcount.h>
-#include <linux/rbtree.h>
+#include <linux/vfsdebug.h>
#include <uapi/linux/sched.h>
+#include <uapi/linux/nsfs.h>
-struct proc_ns_operations;
-
-struct cgroup_namespace;
-struct ipc_namespace;
-struct mnt_namespace;
-struct net;
-struct pid_namespace;
-struct time_namespace;
-struct user_namespace;
-struct uts_namespace;
-
-extern struct cgroup_namespace init_cgroup_ns;
-extern struct ipc_namespace init_ipc_ns;
-extern struct mnt_namespace init_mnt_ns;
-extern struct net init_net;
-extern struct pid_namespace init_pid_ns;
-extern struct time_namespace init_time_ns;
-extern struct user_namespace init_user_ns;
-extern struct uts_namespace init_uts_ns;
-
-extern const struct proc_ns_operations netns_operations;
-extern const struct proc_ns_operations utsns_operations;
-extern const struct proc_ns_operations ipcns_operations;
-extern const struct proc_ns_operations pidns_operations;
-extern const struct proc_ns_operations pidns_for_children_operations;
-extern const struct proc_ns_operations userns_operations;
-extern const struct proc_ns_operations mntns_operations;
-extern const struct proc_ns_operations cgroupns_operations;
-extern const struct proc_ns_operations timens_operations;
-extern const struct proc_ns_operations timens_for_children_operations;
-
-struct ns_common {
- u32 ns_type;
- struct dentry *stashed;
- const struct proc_ns_operations *ops;
- unsigned int inum;
- refcount_t __ns_ref; /* do not use directly */
- union {
- struct {
- u64 ns_id;
- struct rb_node ns_tree_node;
- struct list_head ns_list_node;
- };
- struct rcu_head ns_rcu;
- };
-};
-
+bool is_current_namespace(struct ns_common *ns);
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
void __ns_common_free(struct ns_common *ns);
+struct ns_common *__must_check ns_owner(struct ns_common *ns);
+
+static __always_inline bool is_ns_init_inum(const struct ns_common *ns)
+{
+ VFS_WARN_ON_ONCE(ns->inum == 0);
+ return unlikely(in_range(ns->inum, MNT_NS_INIT_INO,
+ IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1));
+}
+
+static __always_inline bool is_ns_init_id(const struct ns_common *ns)
+{
+ VFS_WARN_ON_ONCE(ns->ns_id == 0);
+ return ns->ns_id <= NS_LAST_INIT_ID;
+}
-#define to_ns_common(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &(__ns)->ns, \
- const struct cgroup_namespace *: &(__ns)->ns, \
- struct ipc_namespace *: &(__ns)->ns, \
- const struct ipc_namespace *: &(__ns)->ns, \
- struct mnt_namespace *: &(__ns)->ns, \
- const struct mnt_namespace *: &(__ns)->ns, \
- struct net *: &(__ns)->ns, \
- const struct net *: &(__ns)->ns, \
- struct pid_namespace *: &(__ns)->ns, \
- const struct pid_namespace *: &(__ns)->ns, \
- struct time_namespace *: &(__ns)->ns, \
- const struct time_namespace *: &(__ns)->ns, \
- struct user_namespace *: &(__ns)->ns, \
- const struct user_namespace *: &(__ns)->ns, \
- struct uts_namespace *: &(__ns)->ns, \
- const struct uts_namespace *: &(__ns)->ns)
-
-#define ns_init_inum(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
- struct ipc_namespace *: IPC_NS_INIT_INO, \
- struct mnt_namespace *: MNT_NS_INIT_INO, \
- struct net *: NET_NS_INIT_INO, \
- struct pid_namespace *: PID_NS_INIT_INO, \
- struct time_namespace *: TIME_NS_INIT_INO, \
- struct user_namespace *: USER_NS_INIT_INO, \
- struct uts_namespace *: UTS_NS_INIT_INO)
-
-#define ns_init_ns(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &init_cgroup_ns, \
- struct ipc_namespace *: &init_ipc_ns, \
- struct mnt_namespace *: &init_mnt_ns, \
- struct net *: &init_net, \
- struct pid_namespace *: &init_pid_ns, \
- struct time_namespace *: &init_time_ns, \
- struct user_namespace *: &init_user_ns, \
- struct uts_namespace *: &init_uts_ns)
-
-#define to_ns_operations(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
- struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
- struct mnt_namespace *: &mntns_operations, \
- struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
- struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
- struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
- struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
- struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
-
-#define ns_common_type(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: CLONE_NEWCGROUP, \
- struct ipc_namespace *: CLONE_NEWIPC, \
- struct mnt_namespace *: CLONE_NEWNS, \
- struct net *: CLONE_NEWNET, \
- struct pid_namespace *: CLONE_NEWPID, \
- struct time_namespace *: CLONE_NEWTIME, \
- struct user_namespace *: CLONE_NEWUSER, \
- struct uts_namespace *: CLONE_NEWUTS)
+#define NS_COMMON_INIT(nsname) \
+{ \
+ .ns_type = ns_common_type(&nsname), \
+ .ns_id = ns_init_id(&nsname), \
+ .inum = ns_init_inum(&nsname), \
+ .ops = to_ns_operations(&nsname), \
+ .stashed = NULL, \
+ .__ns_ref = REFCOUNT_INIT(1), \
+ .__ns_ref_active = ATOMIC_INIT(1), \
+ .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \
+ .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \
+ .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \
+ .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \
+}
#define ns_common_init(__ns) \
__ns_common_init(to_ns_common(__ns), \
@@ -133,21 +55,96 @@ void __ns_common_free(struct ns_common *ns);
#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns)))
+static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns)
+{
+ return atomic_read(&ns->__ns_ref_active);
+}
+
+static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns)
+{
+ return refcount_read(&ns->__ns_ref);
+}
+
static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
{
- return refcount_dec_and_test(&ns->__ns_ref);
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return false;
+ }
+ if (refcount_dec_and_test(&ns->__ns_ref)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+ return true;
+ }
+ return false;
}
static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
{
- return refcount_inc_not_zero(&ns->__ns_ref);
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return true;
+ }
+ if (refcount_inc_not_zero(&ns->__ns_ref))
+ return true;
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+ return false;
}
-#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref)
-#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref)
-#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
-#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
-#define ns_ref_put_and_lock(__ns, __lock) \
- refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock))
+static __always_inline void __ns_ref_inc(struct ns_common *ns)
+{
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return;
+ }
+ refcount_inc(&ns->__ns_ref);
+}
+
+static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns,
+ spinlock_t *ns_lock)
+{
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return false;
+ }
+ return refcount_dec_and_lock(&ns->__ns_ref, ns_lock);
+}
+
+#define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns)))
+#define ns_ref_inc(__ns) \
+ do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0)
+#define ns_ref_get(__ns) \
+ ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false)
+#define ns_ref_put(__ns) \
+ ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false)
+#define ns_ref_put_and_lock(__ns, __ns_lock) \
+ ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false)
+
+#define ns_ref_active_read(__ns) \
+ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)
+
+void __ns_ref_active_put(struct ns_common *ns);
+
+#define ns_ref_active_put(__ns) \
+ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0)
+
+static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
+{
+ if (!__ns_ref_active_read(ns)) {
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ return NULL;
+ }
+ if (!__ns_ref_get(ns))
+ return NULL;
+ return ns;
+}
+
+void __ns_ref_active_get(struct ns_common *ns);
+
+#define ns_ref_active_get(__ns) \
+ do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
#endif
diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h
index e5a5fa83d36b..731b67fc2fec 100644
--- a/include/linux/nsfs.h
+++ b/include/linux/nsfs.h
@@ -37,4 +37,7 @@ void nsfs_init(void);
#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns)
+void nsproxy_ns_active_get(struct nsproxy *ns);
+void nsproxy_ns_active_put(struct nsproxy *ns);
+
#endif /* _LINUX_NSFS_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index bd118a187dec..5a67648721c7 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -93,10 +93,13 @@ static inline struct cred *nsset_cred(struct nsset *set)
*/
int copy_namespaces(u64 flags, struct task_struct *tsk);
-void exit_task_namespaces(struct task_struct *tsk);
+void switch_cred_namespaces(const struct cred *old, const struct cred *new);
+void exit_nsproxy_namespaces(struct task_struct *tsk);
+void get_cred_namespaces(struct task_struct *tsk);
+void exit_cred_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
int exec_task_namespaces(void);
-void free_nsproxy(struct nsproxy *ns);
+void deactivate_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
struct cred *, struct fs_struct *);
int __init nsproxy_cache_init(void);
@@ -104,7 +107,7 @@ int __init nsproxy_cache_init(void);
static inline void put_nsproxy(struct nsproxy *ns)
{
if (refcount_dec_and_test(&ns->count))
- free_nsproxy(ns);
+ deactivate_nsproxy(ns);
}
static inline void get_nsproxy(struct nsproxy *ns)
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 8b8636690473..175e4625bfa6 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -1,22 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#ifndef _LINUX_NSTREE_H
#define _LINUX_NSTREE_H
-#include <linux/ns_common.h>
+#include <linux/ns/nstree_types.h>
#include <linux/nsproxy.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/rculist.h>
#include <linux/cookie.h>
+#include <uapi/linux/nsfs.h>
-extern struct ns_tree cgroup_ns_tree;
-extern struct ns_tree ipc_ns_tree;
-extern struct ns_tree mnt_ns_tree;
-extern struct ns_tree net_ns_tree;
-extern struct ns_tree pid_ns_tree;
-extern struct ns_tree time_ns_tree;
-extern struct ns_tree user_ns_tree;
-extern struct ns_tree uts_ns_tree;
+struct ns_common;
+
+extern struct ns_tree_root cgroup_ns_tree;
+extern struct ns_tree_root ipc_ns_tree;
+extern struct ns_tree_root mnt_ns_tree;
+extern struct ns_tree_root net_ns_tree;
+extern struct ns_tree_root pid_ns_tree;
+extern struct ns_tree_root time_ns_tree;
+extern struct ns_tree_root user_ns_tree;
+extern struct ns_tree_root uts_ns_tree;
+
+void ns_tree_node_init(struct ns_tree_node *node);
+void ns_tree_root_init(struct ns_tree_root *root);
+bool ns_tree_node_empty(const struct ns_tree_node *node);
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+ struct ns_tree_root *root,
+ int (*cmp)(struct rb_node *, const struct rb_node *));
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root);
#define to_ns_tree(__ns) \
_Generic((__ns), \
@@ -29,17 +41,21 @@ extern struct ns_tree uts_ns_tree;
struct user_namespace *: &(user_ns_tree), \
struct uts_namespace *: &(uts_ns_tree))
-u64 ns_tree_gen_id(struct ns_common *ns);
-void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree);
-void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree);
+#define ns_tree_gen_id(__ns) \
+ __ns_tree_gen_id(to_ns_common(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
+
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id);
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree);
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree);
struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type);
struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
- struct ns_tree *ns_tree,
+ struct ns_tree_root *ns_tree,
bool previous);
-static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
+static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id)
{
- ns_tree_gen_id(ns);
+ __ns_tree_gen_id(ns, id);
__ns_tree_add_raw(ns, ns_tree);
}
@@ -59,7 +75,9 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
* This function assigns a new id to the namespace and adds it to the
* appropriate namespace tree and list.
*/
-#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns))
+#define ns_tree_add(__ns) \
+ __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
/**
* ns_tree_remove - Remove a namespace from a namespace tree
@@ -73,6 +91,6 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
#define ns_tree_adjoined_rcu(__ns, __previous) \
__ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous)
-#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node))
+#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node))
#endif /* _LINUX_NSTREE_H */
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 445517a72ad0..0e7ae12c96d2 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -61,8 +61,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
- if (ns != &init_pid_ns)
- ns_ref_inc(ns);
+ ns_ref_inc(ns);
return ns;
}
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index 2503f7625d65..a651e60d9410 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -9,6 +9,7 @@ struct pseudo_fs_context {
const struct xattr_handler * const *xattr;
const struct dentry_operations *dops;
unsigned long magic;
+ unsigned int s_d_flags;
};
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 66c06fcdfe19..cf84d98964b2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -77,6 +77,7 @@ struct cachestat_range;
struct cachestat;
struct statmount;
struct mnt_id_req;
+struct ns_id_req;
struct xattr_args;
struct file_attr;
@@ -437,6 +438,9 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
u64 __user *mnt_ids, size_t nr_mnt_ids,
unsigned int flags);
+asmlinkage long sys_listns(const struct ns_id_req __user *req,
+ u64 __user *ns_ids, size_t nr_ns_ids,
+ unsigned int flags);
asmlinkage long sys_truncate(const char __user *path, long length);
asmlinkage long sys_ftruncate(unsigned int fd, off_t length);
#if BITS_PER_LONG == 32
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 9a9aebbf96b9..9c3be157397e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -166,13 +166,13 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX;
}
-#ifdef CONFIG_USER_NS
-
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
return container_of(ns, struct user_namespace, ns);
}
+#ifdef CONFIG_USER_NS
+
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
if (ns)
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 04e0077fb4c9..942370b3f5d2 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -857,9 +857,11 @@ __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr)
__SYSCALL(__NR_file_getattr, sys_file_getattr)
#define __NR_file_setattr 469
__SYSCALL(__NR_file_setattr, sys_file_setattr)
+#define __NR_listns 470
+__SYSCALL(__NR_listns, sys_listns)
#undef __NR_syscalls
-#define __NR_syscalls 470
+#define __NR_syscalls 471
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index e098759ec917..a25e38d1c874 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -67,4 +67,62 @@ struct nsfs_file_handle {
#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
+enum init_ns_id {
+ IPC_NS_INIT_ID = 1ULL,
+ UTS_NS_INIT_ID = 2ULL,
+ USER_NS_INIT_ID = 3ULL,
+ PID_NS_INIT_ID = 4ULL,
+ CGROUP_NS_INIT_ID = 5ULL,
+ TIME_NS_INIT_ID = 6ULL,
+ NET_NS_INIT_ID = 7ULL,
+ MNT_NS_INIT_ID = 8ULL,
+#ifdef __KERNEL__
+ NS_LAST_INIT_ID = MNT_NS_INIT_ID,
+#endif
+};
+
+enum ns_type {
+ TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */
+ MNT_NS = (1ULL << 17), /* CLONE_NEWNS */
+ CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */
+ UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */
+ IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */
+ USER_NS = (1ULL << 28), /* CLONE_NEWUSER */
+ PID_NS = (1ULL << 29), /* CLONE_NEWPID */
+ NET_NS = (1ULL << 30), /* CLONE_NEWNET */
+};
+
+/**
+ * struct ns_id_req - namespace ID request structure
+ * @size: size of this structure
+ * @spare: reserved for future use
+ * @filter: filter mask
+ * @ns_id: last namespace id
+ * @user_ns_id: owning user namespace ID
+ *
+ * Structure for passing namespace ID and miscellaneous parameters to
+ * statns(2) and listns(2).
+ *
+ * For statns(2) @param represents the request mask.
+ * For listns(2) @param represents the last listed mount id (or zero).
+ */
+struct ns_id_req {
+ __u32 size;
+ __u32 spare;
+ __u64 ns_id;
+ struct /* listns */ {
+ __u32 ns_type;
+ __u32 spare2;
+ __u64 user_ns_id;
+ };
+};
+
+/*
+ * Special @user_ns_id value that can be passed to listns()
+ */
+#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */
+
+/* List of all ns_id_req versions. */
+#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */
+
#endif /* __LINUX_NSFS_H */
diff --git a/init/version-timestamp.c b/init/version-timestamp.c
index d071835121c2..375726e05f69 100644
--- a/init/version-timestamp.c
+++ b/init/version-timestamp.c
@@ -8,8 +8,7 @@
#include <linux/utsname.h>
struct uts_namespace init_uts_ns = {
- .ns.ns_type = ns_common_type(&init_uts_ns),
- .ns.__ns_ref = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_uts_ns),
.name = {
.sysname = UTS_SYSNAME,
.nodename = UTS_NODENAME,
@@ -19,10 +18,6 @@ struct uts_namespace init_uts_ns = {
.domainname = UTS_DOMAINNAME,
},
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_uts_ns),
-#ifdef CONFIG_UTS_NS
- .ns.ops = &utsns_operations,
-#endif
};
/* FIXED STRINGS! Don't touch! */
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 7a03f6d03de3..e28f0cecb2ec 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -27,13 +27,8 @@ DEFINE_SPINLOCK(mq_lock);
* and not CONFIG_IPC_NS.
*/
struct ipc_namespace init_ipc_ns = {
- .ns.__ns_ref = REFCOUNT_INIT(1),
+ .ns = NS_COMMON_INIT(init_ipc_ns),
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_ipc_ns),
-#ifdef CONFIG_IPC_NS
- .ns.ops = &ipcns_operations,
-#endif
- .ns.ns_type = ns_common_type(&init_ipc_ns),
};
struct msg_msgseg {
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 59b12fcb40bd..c0dbfdd9015f 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -66,6 +66,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
if (err)
goto fail_free;
+ ns_tree_gen_id(ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
@@ -86,7 +87,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
sem_init_ns(ns);
shm_init_ns(ns);
- ns_tree_add(ns);
+ ns_tree_add_raw(ns);
return ns;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index fdee387f0d6b..2bf3951ca88f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -250,12 +250,9 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.__ns_ref = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_cgroup_ns),
.user_ns = &init_user_ns,
- .ns.ops = &cgroupns_operations,
- .ns.inum = ns_init_inum(&init_cgroup_ns),
.root_cset = &init_css_set,
- .ns.ns_type = ns_common_type(&init_cgroup_ns),
};
static struct file_system_type cgroup2_fs_type;
@@ -1522,9 +1519,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
} else {
/*
* NOTE: This function may be called from bpf_cgroup_from_id()
- * on a task which has already passed exit_task_namespaces() and
- * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
- * cgroups visible for lookups.
+ * on a task which has already passed exit_nsproxy_namespaces()
+ * and nsproxy == NULL. Fall back to cgrp_dfl_root which will
+ * make all cgroups visible for lookups.
*/
return &cgrp_dfl_root.cgrp;
}
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index fdbe57578e68..db9617556dd7 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -30,7 +30,6 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
ret = ns_common_init(new_ns);
if (ret)
return ERR_PTR(ret);
- ns_tree_add(new_ns);
return no_free_ptr(new_ns);
}
@@ -86,6 +85,7 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags,
new_ns->ucounts = ucounts;
new_ns->root_cset = cset;
+ ns_tree_add(new_ns);
return new_ns;
}
diff --git a/kernel/cred.c b/kernel/cred.c
index dbf6b687dc5c..a6e7f580df14 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -306,6 +306,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
kdebug("share_creds(%p{%ld})",
p->cred, atomic_long_read(&p->cred->usage));
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ get_cred_namespaces(p);
return 0;
}
@@ -343,6 +344,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
p->cred = p->real_cred = get_cred(new);
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ get_cred_namespaces(p);
+
return 0;
error_put:
@@ -435,10 +438,13 @@ int commit_creds(struct cred *new)
*/
if (new->user != old->user || new->user_ns != old->user_ns)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+ if (new->user_ns != old->user_ns)
+ switch_cred_namespaces(old, new);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
diff --git a/kernel/exit.c b/kernel/exit.c
index 9f74e8f1c431..988e16efd66b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -291,6 +291,7 @@ repeat:
write_unlock_irq(&tasklist_lock);
/* @thread_pid can't go away until free_pids() below */
proc_flush_pid(thread_pid);
+ exit_cred_namespaces(p);
add_device_randomness(&p->se.sum_exec_runtime,
sizeof(p->se.sum_exec_runtime));
free_pids(post.pids);
@@ -962,7 +963,7 @@ void __noreturn do_exit(long code)
exit_fs(tsk);
if (group_dead)
disassociate_ctty(1);
- exit_task_namespaces(tsk);
+ exit_nsproxy_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..f1857672426e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2453,7 +2453,7 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
- exit_task_namespaces(p);
+ exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
mm_clear_owner(p->mm, p);
@@ -2487,6 +2487,7 @@ bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ exit_cred_namespaces(p);
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index c1fb2bad6d72..bdc3c86231d3 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -1,7 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#include <linux/ns_common.h>
+#include <linux/nstree.h>
#include <linux/proc_ns.h>
+#include <linux/user_namespace.h>
#include <linux/vfsdebug.h>
#ifdef CONFIG_DEBUG_VFS
@@ -52,26 +55,257 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
+ int ret = 0;
+
refcount_set(&ns->__ns_ref, 1);
ns->stashed = NULL;
ns->ops = ops;
ns->ns_id = 0;
ns->ns_type = ns_type;
- RB_CLEAR_NODE(&ns->ns_tree_node);
- INIT_LIST_HEAD(&ns->ns_list_node);
+ ns_tree_node_init(&ns->ns_tree_node);
+ ns_tree_node_init(&ns->ns_unified_node);
+ ns_tree_node_init(&ns->ns_owner_node);
+ ns_tree_root_init(&ns->ns_owner_root);
#ifdef CONFIG_DEBUG_VFS
ns_debug(ns, ops);
#endif
- if (inum) {
+ if (inum)
ns->inum = inum;
- return 0;
- }
- return proc_alloc_inum(&ns->inum);
+ else
+ ret = proc_alloc_inum(&ns->inum);
+ if (ret)
+ return ret;
+ /*
+ * Tree ref starts at 0. It's incremented when namespace enters
+ * active use (installed in nsproxy) and decremented when all
+ * active uses are gone. Initial namespaces are always active.
+ */
+ if (is_ns_init_inum(ns))
+ atomic_set(&ns->__ns_ref_active, 1);
+ else
+ atomic_set(&ns->__ns_ref_active, 0);
+ return 0;
}
void __ns_common_free(struct ns_common *ns)
{
proc_free_inum(ns->inum);
}
+
+struct ns_common *__must_check ns_owner(struct ns_common *ns)
+{
+ struct user_namespace *owner;
+
+ if (unlikely(!ns->ops))
+ return NULL;
+ VFS_WARN_ON_ONCE(!ns->ops->owner);
+ owner = ns->ops->owner(ns);
+ VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
+ if (!owner)
+ return NULL;
+ /* Skip init_user_ns as it's always active */
+ if (owner == &init_user_ns)
+ return NULL;
+ return to_ns_common(owner);
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down.
+ *
+ * A regular namespace tree might look as follow:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * The iteration stops once we reach a namespace that still has active
+ * references.
+ */
+void __ns_ref_active_put(struct ns_common *ns)
+{
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
+
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+ }
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down. This makes it possible to efficiently
+ * resurrect a namespace tree:
+ *
+ * A regular namespace tree might look as follow:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Assume the whole tree is dead but all namespaces are still active:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - - -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()):
+ *
+ * net_ns pid_ns
+ * \ /
+ * + -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - + -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If net_ns had a zero reference count and we bumped it we also need to
+ * take another reference on its owning user namespace. Similarly, if
+ * pid_ns had a zero reference count it also needs to take another
+ * reference on its owning user namespace. So both net_ns and pid_ns
+ * will each have their own reference on the owning user namespace.
+ *
+ * If the owning user namespace user_ns1 had a zero reference count then
+ * it also needs to take another reference on its owning user namespace
+ * and so on.
+ */
+void __ns_ref_active_get(struct ns_common *ns)
+{
+ int prev;
+
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ /* If we didn't resurrect the namespace we're done. */
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+
+ /*
+ * We did resurrect it. Walk the ownership hierarchy upwards
+ * until we found an owning user namespace that is active.
+ */
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+ }
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 19aa64ab08c8..259c4b4f1eeb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
#include <linux/syscalls.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
+#include <linux/nstree.h>
static struct kmem_cache *nsproxy_cachep;
@@ -59,6 +60,25 @@ static inline struct nsproxy *create_nsproxy(void)
return nsproxy;
}
+static inline void nsproxy_free(struct nsproxy *ns)
+{
+ put_mnt_ns(ns->mnt_ns);
+ put_uts_ns(ns->uts_ns);
+ put_ipc_ns(ns->ipc_ns);
+ put_pid_ns(ns->pid_ns_for_children);
+ put_time_ns(ns->time_ns);
+ put_time_ns(ns->time_ns_for_children);
+ put_cgroup_ns(ns->cgroup_ns);
+ put_net(ns->net_ns);
+ kmem_cache_free(nsproxy_cachep, ns);
+}
+
+void deactivate_nsproxy(struct nsproxy *ns)
+{
+ nsproxy_ns_active_put(ns);
+ nsproxy_free(ns);
+}
+
/*
* Create new nsproxy and all of its the associated namespaces.
* Return the newly created nsproxy. Do not attach this to the task,
@@ -179,23 +199,11 @@ int copy_namespaces(u64 flags, struct task_struct *tsk)
if ((flags & CLONE_VM) == 0)
timens_on_fork(new_ns, tsk);
+ nsproxy_ns_active_get(new_ns);
tsk->nsproxy = new_ns;
return 0;
}
-void free_nsproxy(struct nsproxy *ns)
-{
- put_mnt_ns(ns->mnt_ns);
- put_uts_ns(ns->uts_ns);
- put_ipc_ns(ns->ipc_ns);
- put_pid_ns(ns->pid_ns_for_children);
- put_time_ns(ns->time_ns);
- put_time_ns(ns->time_ns_for_children);
- put_cgroup_ns(ns->cgroup_ns);
- put_net(ns->net_ns);
- kmem_cache_free(nsproxy_cachep, ns);
-}
-
/*
* Called from unshare. Unshare all the namespaces part of nsproxy.
* On success, returns the new nsproxy.
@@ -232,6 +240,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
might_sleep();
+ if (new)
+ nsproxy_ns_active_get(new);
+
task_lock(p);
ns = p->nsproxy;
p->nsproxy = new;
@@ -241,11 +252,27 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
put_nsproxy(ns);
}
-void exit_task_namespaces(struct task_struct *p)
+void exit_nsproxy_namespaces(struct task_struct *p)
{
switch_task_namespaces(p, NULL);
}
+void switch_cred_namespaces(const struct cred *old, const struct cred *new)
+{
+ ns_ref_active_get(new->user_ns);
+ ns_ref_active_put(old->user_ns);
+}
+
+void get_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_get(tsk->real_cred->user_ns);
+}
+
+void exit_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_put(tsk->real_cred->user_ns);
+}
+
int exec_task_namespaces(void)
{
struct task_struct *tsk = current;
@@ -315,7 +342,7 @@ static void put_nsset(struct nsset *nsset)
if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
free_fs_struct(nsset->fs);
if (nsset->nsproxy)
- free_nsproxy(nsset->nsproxy);
+ nsproxy_free(nsset->nsproxy);
}
static int prepare_nsset(unsigned flags, struct nsset *nsset)
diff --git a/kernel/nstree.c b/kernel/nstree.c
index b24a320a11a6..f36c59e6951d 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -1,140 +1,261 @@
// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#include <linux/nstree.h>
#include <linux/proc_ns.h>
+#include <linux/rculist.h>
#include <linux/vfsdebug.h>
+#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
-/**
- * struct ns_tree - Namespace tree
- * @ns_tree: Rbtree of namespaces of a particular type
- * @ns_list: Sequentially walkable list of all namespaces of this type
- * @ns_tree_lock: Seqlock to protect the tree and list
- * @type: type of namespaces in this tree
- */
-struct ns_tree {
- struct rb_root ns_tree;
- struct list_head ns_list;
- seqlock_t ns_tree_lock;
- int type;
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
+
+DEFINE_LOCK_GUARD_0(ns_tree_writer,
+ write_seqlock(&ns_tree_lock),
+ write_sequnlock(&ns_tree_lock))
+
+DEFINE_LOCK_GUARD_0(ns_tree_locked_reader,
+ read_seqlock_excl(&ns_tree_lock),
+ read_sequnlock_excl(&ns_tree_lock))
+
+static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head),
};
-struct ns_tree mnt_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
- .type = CLONE_NEWNS,
+struct ns_tree_root mnt_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head),
};
-struct ns_tree net_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
- .type = CLONE_NEWNET,
+struct ns_tree_root net_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head),
};
EXPORT_SYMBOL_GPL(net_ns_tree);
-struct ns_tree uts_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
- .type = CLONE_NEWUTS,
+struct ns_tree_root uts_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head),
};
-struct ns_tree user_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
- .type = CLONE_NEWUSER,
+struct ns_tree_root user_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head),
};
-struct ns_tree ipc_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
- .type = CLONE_NEWIPC,
+struct ns_tree_root ipc_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head),
};
-struct ns_tree pid_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
- .type = CLONE_NEWPID,
+struct ns_tree_root pid_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head),
};
-struct ns_tree cgroup_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
- .type = CLONE_NEWCGROUP,
+struct ns_tree_root cgroup_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head),
};
-struct ns_tree time_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
- .type = CLONE_NEWTIME,
+struct ns_tree_root time_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head),
};
-DEFINE_COOKIE(namespace_cookie);
+/**
+ * ns_tree_node_init - Initialize a namespace tree node
+ * @node: The node to initialize
+ *
+ * Initializes both the rbtree node and list entry.
+ */
+void ns_tree_node_init(struct ns_tree_node *node)
+{
+ RB_CLEAR_NODE(&node->ns_node);
+ INIT_LIST_HEAD(&node->ns_list_entry);
+}
+
+/**
+ * ns_tree_root_init - Initialize a namespace tree root
+ * @root: The root to initialize
+ *
+ * Initializes both the rbtree root and list head.
+ */
+void ns_tree_root_init(struct ns_tree_root *root)
+{
+ root->ns_rb = RB_ROOT;
+ INIT_LIST_HEAD(&root->ns_list_head);
+}
+
+/**
+ * ns_tree_node_empty - Check if a namespace tree node is empty
+ * @node: The node to check
+ *
+ * Returns true if the node is not in any tree.
+ */
+bool ns_tree_node_empty(const struct ns_tree_node *node)
+{
+ return RB_EMPTY_NODE(&node->ns_node);
+}
+
+/**
+ * ns_tree_node_add - Add a node to a namespace tree
+ * @node: The node to add
+ * @root: The tree root to add to
+ * @cmp: Comparison function for rbtree insertion
+ *
+ * Adds the node to both the rbtree and the list, maintaining sorted order.
+ * The list is maintained in the same order as the rbtree to enable efficient
+ * iteration.
+ *
+ * Returns: NULL if insertion succeeded, existing node if duplicate found
+ */
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+ struct ns_tree_root *root,
+ int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node *ret, *prev;
+
+ /* Add to rbtree */
+ ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp);
+
+ /* Add to list in sorted order */
+ prev = rb_prev(&node->ns_node);
+ if (!prev) {
+ /* No previous node, add at head */
+ list_add_rcu(&node->ns_list_entry, &root->ns_list_head);
+ } else {
+ /* Add after previous node */
+ struct ns_tree_node *prev_node;
+ prev_node = rb_entry(prev, struct ns_tree_node, ns_node);
+ list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry);
+ }
+
+ return ret;
+}
+
+/**
+ * ns_tree_node_del - Remove a node from a namespace tree
+ * @node: The node to remove
+ * @root: The tree root to remove from
+ *
+ * Removes the node from both the rbtree and the list atomically.
+ */
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root)
+{
+ rb_erase(&node->ns_node, &root->ns_rb);
+ RB_CLEAR_NODE(&node->ns_node);
+ list_bidir_del_rcu(&node->ns_list_entry);
+}
static inline struct ns_common *node_to_ns(const struct rb_node *node)
{
if (!node)
return NULL;
- return rb_entry(node, struct ns_common, ns_tree_node);
+ return rb_entry(node, struct ns_common, ns_tree_node.ns_node);
}
-static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
+static inline struct ns_common *node_to_ns_unified(const struct rb_node *node)
{
- struct ns_common *ns_a = node_to_ns(a);
- struct ns_common *ns_b = node_to_ns(b);
- u64 ns_id_a = ns_a->ns_id;
- u64 ns_id_b = ns_b->ns_id;
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_unified_node.ns_node);
+}
- if (ns_id_a < ns_id_b)
+static inline struct ns_common *node_to_ns_owner(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_owner_node.ns_node);
+}
+
+static int ns_id_cmp(u64 id_a, u64 id_b)
+{
+ if (id_a < id_b)
return -1;
- if (ns_id_a > ns_id_b)
+ if (id_a > id_b)
return 1;
return 0;
}
-void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
+static int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id);
+}
+
+static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id);
+}
+
+static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b)
{
- struct rb_node *node, *prev;
+ return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id);
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree)
+{
+ struct rb_node *node;
+ const struct proc_ns_operations *ops = ns->ops;
VFS_WARN_ON_ONCE(!ns->ns_id);
- write_seqlock(&ns_tree->ns_tree_lock);
+ guard(ns_tree_writer)();
- VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+ /* Add to per-type tree and list */
+ node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp);
- node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
- /*
- * If there's no previous entry simply add it after the
- * head and if there is add it after the previous entry.
- */
- prev = rb_prev(&ns->ns_tree_node);
- if (!prev)
- list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
- else
- list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+ /* Add to unified tree and list */
+ ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified);
+
+ /* Add to owner's tree if applicable */
+ if (ops) {
+ struct user_namespace *user_ns;
- write_sequnlock(&ns_tree->ns_tree_lock);
+ VFS_WARN_ON_ONCE(!ops->owner);
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+ /* Insert into owner's tree and list */
+ ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner);
+ } else {
+ /* Only the initial user namespace doesn't have an owner. */
+ VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns));
+ }
+ }
VFS_WARN_ON_ONCE(node);
}
-void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree)
{
- VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
- VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
- VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+ const struct proc_ns_operations *ops = ns->ops;
+ struct user_namespace *user_ns;
+
+ VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry));
+
+ write_seqlock(&ns_tree_lock);
+
+ /* Remove from per-type tree and list */
+ ns_tree_node_del(&ns->ns_tree_node, ns_tree);
+
+ /* Remove from unified tree and list */
+ ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root);
- write_seqlock(&ns_tree->ns_tree_lock);
- rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
- list_bidir_del_rcu(&ns->ns_list_node);
- RB_CLEAR_NODE(&ns->ns_tree_node);
- write_sequnlock(&ns_tree->ns_tree_lock);
+ /* Remove from owner's tree if applicable */
+ if (ops) {
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root);
+ }
+ }
+
+ write_sequnlock(&ns_tree_lock);
}
EXPORT_SYMBOL_GPL(__ns_tree_remove);
@@ -150,8 +271,19 @@ static int ns_find(const void *key, const struct rb_node *node)
return 0;
}
+static int ns_find_unified(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns_unified(node);
-static struct ns_tree *ns_tree_from_type(int ns_type)
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+static struct ns_tree_root *ns_tree_from_type(int ns_type)
{
switch (ns_type) {
case CLONE_NEWCGROUP:
@@ -175,73 +307,507 @@ static struct ns_tree *ns_tree_from_type(int ns_type)
return NULL;
}
-struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id)
{
- struct ns_tree *ns_tree;
struct rb_node *node;
unsigned int seq;
- RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+ do {
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree_lock, seq));
+
+ return node_to_ns_unified(node);
+}
+
+static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree_root *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
ns_tree = ns_tree_from_type(ns_type);
if (!ns_tree)
return NULL;
do {
- seq = read_seqbegin(&ns_tree->ns_tree_lock);
- node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find);
if (node)
break;
- } while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+ } while (read_seqretry(&ns_tree_lock, seq));
- if (!node)
- return NULL;
+ return node_to_ns(node);
+}
- VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type);
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
- return node_to_ns(node);
+ if (ns_type)
+ return __ns_tree_lookup_rcu(ns_id, ns_type);
+
+ return __ns_unified_tree_lookup_rcu(ns_id);
}
/**
- * ns_tree_adjoined_rcu - find the next/previous namespace in the same
+ * __ns_tree_adjoined_rcu - find the next/previous namespace in the same
* tree
* @ns: namespace to start from
+ * @ns_tree: namespace tree to search in
* @previous: if true find the previous namespace, otherwise the next
*
* Find the next or previous namespace in the same tree as @ns. If
* there is no next/previous namespace, -ENOENT is returned.
*/
struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
- struct ns_tree *ns_tree, bool previous)
+ struct ns_tree_root *ns_tree, bool previous)
{
struct list_head *list;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
if (previous)
- list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry));
else
- list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
- if (list_is_head(list, &ns_tree->ns_list))
+ list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry));
+ if (list_is_head(list, &ns_tree->ns_list_head))
return ERR_PTR(-ENOENT);
- VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type);
-
- return list_entry_rcu(list, struct ns_common, ns_list_node);
+ return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry);
}
/**
- * ns_tree_gen_id - generate a new namespace id
+ * __ns_tree_gen_id - generate a new namespace id
* @ns: namespace to generate id for
+ * @id: if non-zero, this is the initial namespace and this is a fixed id
*
* Generates a new namespace id and assigns it to the namespace. All
* namespaces types share the same id space and thus can be compared
* directly. IOW, when two ids of two namespace are equal, they are
* identical.
*/
-u64 ns_tree_gen_id(struct ns_common *ns)
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id)
{
- guard(preempt)();
- ns->ns_id = gen_cookie_next(&namespace_cookie);
+ static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1);
+
+ if (id)
+ ns->ns_id = id;
+ else
+ ns->ns_id = atomic64_inc_return(&namespace_cookie);
return ns->ns_id;
}
+
+struct klistns {
+ u64 __user *uns_ids;
+ u32 nr_ns_ids;
+ u64 last_ns_id;
+ u64 user_ns_id;
+ u32 ns_type;
+ struct user_namespace *user_ns;
+ bool userns_capable;
+ struct ns_common *first_ns;
+};
+
+static void __free_klistns_free(const struct klistns *kls)
+{
+ if (kls->user_ns_id != LISTNS_CURRENT_USER)
+ put_user_ns(kls->user_ns);
+ if (kls->first_ns && kls->first_ns->ops)
+ kls->first_ns->ops->put(kls->first_ns);
+}
+
+#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS)
+
+static int copy_ns_id_req(const struct ns_id_req __user *req,
+ struct ns_id_req *kreq)
+{
+ int ret;
+ size_t usize;
+
+ BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0);
+
+ ret = get_user(usize, &req->size);
+ if (ret)
+ return -EFAULT;
+ if (unlikely(usize > PAGE_SIZE))
+ return -E2BIG;
+ if (unlikely(usize < NS_ID_REQ_SIZE_VER0))
+ return -EINVAL;
+ memset(kreq, 0, sizeof(*kreq));
+ ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+ if (ret)
+ return ret;
+ if (kreq->spare != 0)
+ return -EINVAL;
+ if (kreq->ns_type & ~NS_ALL)
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq,
+ u64 __user *ns_ids, size_t nr_ns_ids)
+{
+ kls->last_ns_id = kreq->ns_id;
+ kls->user_ns_id = kreq->user_ns_id;
+ kls->nr_ns_ids = nr_ns_ids;
+ kls->ns_type = kreq->ns_type;
+ kls->uns_ids = ns_ids;
+ return 0;
+}
+
+/*
+ * Lookup a namespace owned by owner with id >= ns_id.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner)
+{
+ struct ns_common *ret = NULL;
+ struct rb_node *node;
+
+ VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+ guard(ns_tree_locked_reader)();
+
+ node = owner->ns_owner_root.ns_rb.rb_node;
+ while (node) {
+ struct ns_common *ns;
+
+ ns = node_to_ns_owner(node);
+ if (ns_id <= ns->ns_id) {
+ ret = ns;
+ if (ns_id == ns->ns_id)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+
+ if (ret)
+ ret = ns_get_unless_inactive(ret);
+ return ret;
+}
+
+static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type)
+{
+ struct ns_common *ns;
+
+ guard(rcu)();
+ ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type);
+ if (!ns)
+ return NULL;
+
+ if (!ns_get_unless_inactive(ns))
+ return NULL;
+
+ return ns;
+}
+
+static inline bool __must_check ns_requested(const struct klistns *kls,
+ const struct ns_common *ns)
+{
+ return !kls->ns_type || (kls->ns_type & ns->ns_type);
+}
+
+static inline bool __must_check may_list_ns(const struct klistns *kls,
+ struct ns_common *ns)
+{
+ if (kls->user_ns) {
+ if (kls->userns_capable)
+ return true;
+ } else {
+ struct ns_common *owner;
+ struct user_namespace *user_ns;
+
+ owner = ns_owner(ns);
+ if (owner)
+ user_ns = to_user_ns(owner);
+ else
+ user_ns = &init_user_ns;
+ if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN))
+ return true;
+ }
+
+ if (is_current_namespace(ns))
+ return true;
+
+ if (ns->ns_type != CLONE_NEWUSER)
+ return false;
+
+ if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN))
+ return true;
+
+ return false;
+}
+
+static inline void ns_put(struct ns_common *ns)
+{
+ if (ns && ns->ops)
+ ns->ops->put(ns);
+}
+
+DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T))
+
+static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls,
+ struct ns_common *candidate)
+{
+ struct ns_common *ns __free(ns_put) = NULL;
+
+ if (!ns_requested(kls, candidate))
+ return NULL;
+
+ ns = ns_get_unless_inactive(candidate);
+ if (!ns)
+ return NULL;
+
+ if (!may_list_ns(kls, ns))
+ return NULL;
+
+ return no_free_ptr(ns);
+}
+
+static ssize_t do_listns_userns(struct klistns *kls)
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL;
+ const struct list_head *head;
+ ssize_t ret;
+
+ VFS_WARN_ON_ONCE(!kls->user_ns_id);
+
+ if (kls->user_ns_id == LISTNS_CURRENT_USER)
+ ns = to_ns_common(current_user_ns());
+ else if (kls->user_ns_id)
+ ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER);
+ if (!ns)
+ return -EINVAL;
+ kls->user_ns = to_user_ns(ns);
+
+ /*
+ * Use the rbtree to find the first namespace we care about and
+ * then use it's list entry to iterate from there.
+ */
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns);
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head;
+ kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN);
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry);
+
+ ns = first_ns;
+ list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) {
+ struct ns_common *valid;
+
+ if (!nr_ns_ids)
+ break;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock();
+
+ ns_put(prev);
+ prev = valid;
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+/*
+ * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
+{
+ struct ns_common *ret = NULL;
+ struct ns_tree_root *ns_tree = NULL;
+ struct rb_node *node;
+
+ if (ns_type) {
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+ }
+
+ guard(ns_tree_locked_reader)();
+
+ if (ns_tree)
+ node = ns_tree->ns_rb.rb_node;
+ else
+ node = ns_unified_root.ns_rb.rb_node;
+
+ while (node) {
+ struct ns_common *ns;
+
+ if (ns_type)
+ ns = node_to_ns(node);
+ else
+ ns = node_to_ns_unified(node);
+
+ if (ns_id <= ns->ns_id) {
+ if (ns_type)
+ ret = node_to_ns(node);
+ else
+ ret = node_to_ns_unified(node);
+ if (ns_id == ns->ns_id)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+
+ if (ret)
+ ret = ns_get_unless_inactive(ret);
+ return ret;
+}
+
+static inline struct ns_common *first_ns_common(const struct list_head *head,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry);
+ return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry);
+}
+
+static inline struct ns_common *next_ns_common(struct ns_common *ns,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry);
+ return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry);
+}
+
+static inline bool ns_common_is_head(struct ns_common *ns,
+ const struct list_head *head,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return &ns->ns_tree_node.ns_list_entry == head;
+ return &ns->ns_unified_node.ns_list_entry == head;
+}
+
+static ssize_t do_listns(struct klistns *kls)
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns, *first_ns = NULL, *prev = NULL;
+ struct ns_tree_root *ns_tree = NULL;
+ const struct list_head *head;
+ u32 ns_type;
+ ssize_t ret;
+
+ if (hweight32(kls->ns_type) == 1)
+ ns_type = kls->ns_type;
+ else
+ ns_type = 0;
+
+ if (ns_type) {
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return -EINVAL;
+ }
+
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type);
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ if (ns_tree)
+ head = &ns_tree->ns_list_head;
+ else
+ head = &ns_unified_root.ns_list_head;
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = first_ns_common(head, ns_tree);
+
+ for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids;
+ ns = next_ns_common(ns, ns_tree)) {
+ struct ns_common *valid;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock();
+
+ ns_put(prev);
+ prev = valid;
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req,
+ u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags)
+{
+ struct klistns klns __free(klistns_free) = {};
+ const size_t maxcount = 1000000;
+ struct ns_id_req kreq;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (unlikely(nr_ns_ids > maxcount))
+ return -EOVERFLOW;
+
+ if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids)))
+ return -EFAULT;
+
+ ret = copy_ns_id_req(req, &kreq);
+ if (ret)
+ return ret;
+
+ ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids);
+ if (ret)
+ return ret;
+
+ if (kreq.user_ns_id)
+ return do_listns_userns(&klns);
+
+ return do_listns(&klns);
+}
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fffec767a63..a31771bc89c1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,21 +71,16 @@ static int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .ns.__ns_ref = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_pid_ns),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_pid_ns),
-#ifdef CONFIG_PID_NS
- .ns.ops = &pidns_operations,
-#endif
.pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
- .ns.ns_type = ns_common_type(&init_pid_ns),
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -117,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
int i;
+ struct pid_namespace *active_ns;
lockdep_assert_not_held(&tasklist_lock);
+ active_ns = pid->numbers[pid->level].ns;
+ ns_ref_active_put(active_ns);
+
spin_lock(&pidmap_lock);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
@@ -283,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
}
spin_unlock(&pidmap_lock);
idr_preload_end();
+ ns_ref_active_get(ns);
return pid;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 650be58d8d18..e48f5de41361 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags,
void put_pid_ns(struct pid_namespace *ns)
{
- if (ns && ns != &init_pid_ns && ns_ref_put(ns))
+ if (ns && ns_ref_put(ns))
schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 5b6997f4dc3d..e76be24b132c 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -478,11 +478,8 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.ns_type = ns_common_type(&init_time_ns),
- .ns.__ns_ref = REFCOUNT_INIT(3),
+ .ns = NS_COMMON_INIT(init_time_ns),
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_time_ns),
- .ns.ops = &timens_operations,
.frozen_offsets = true,
};
diff --git a/kernel/user.c b/kernel/user.c
index 0163665914c9..7aef4e679a6a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -35,6 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc);
* and 1 for... ?
*/
struct user_namespace init_user_ns = {
+ .ns = NS_COMMON_INIT(init_user_ns),
.uid_map = {
{
.extent[0] = {
@@ -65,14 +66,8 @@ struct user_namespace init_user_ns = {
.nr_extents = 1,
},
},
- .ns.ns_type = ns_common_type(&init_user_ns),
- .ns.__ns_ref = REFCOUNT_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .ns.inum = ns_init_inum(&init_user_ns),
-#ifdef CONFIG_USER_NS
- .ns.ops = &userns_operations,
-#endif
.flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_KEYS
.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b0e0f22d7b21..83cbec4afcb3 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -439,7 +439,7 @@ static __net_init int setup_net(struct net *net)
LIST_HEAD(net_exit_list);
int error = 0;
- net->net_cookie = ns_tree_gen_id(&net->ns);
+ net->net_cookie = ns_tree_gen_id(net);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index d1ae5e92c615..e74868be513c 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -410,3 +410,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h
index 33c9b578b3b2..a25e38d1c874 100644
--- a/tools/include/uapi/linux/nsfs.h
+++ b/tools/include/uapi/linux/nsfs.h
@@ -53,6 +53,76 @@ enum init_ns_ino {
TIME_NS_INIT_INO = 0xEFFFFFFAU,
NET_NS_INIT_INO = 0xEFFFFFF9U,
MNT_NS_INIT_INO = 0xEFFFFFF8U,
+#ifdef __KERNEL__
+ MNT_NS_ANON_INO = 0xEFFFFFF7U,
+#endif
};
+struct nsfs_file_handle {
+ __u64 ns_id;
+ __u32 ns_type;
+ __u32 ns_inum;
+};
+
+#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
+#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
+
+enum init_ns_id {
+ IPC_NS_INIT_ID = 1ULL,
+ UTS_NS_INIT_ID = 2ULL,
+ USER_NS_INIT_ID = 3ULL,
+ PID_NS_INIT_ID = 4ULL,
+ CGROUP_NS_INIT_ID = 5ULL,
+ TIME_NS_INIT_ID = 6ULL,
+ NET_NS_INIT_ID = 7ULL,
+ MNT_NS_INIT_ID = 8ULL,
+#ifdef __KERNEL__
+ NS_LAST_INIT_ID = MNT_NS_INIT_ID,
+#endif
+};
+
+enum ns_type {
+ TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */
+ MNT_NS = (1ULL << 17), /* CLONE_NEWNS */
+ CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */
+ UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */
+ IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */
+ USER_NS = (1ULL << 28), /* CLONE_NEWUSER */
+ PID_NS = (1ULL << 29), /* CLONE_NEWPID */
+ NET_NS = (1ULL << 30), /* CLONE_NEWNET */
+};
+
+/**
+ * struct ns_id_req - namespace ID request structure
+ * @size: size of this structure
+ * @spare: reserved for future use
+ * @filter: filter mask
+ * @ns_id: last namespace id
+ * @user_ns_id: owning user namespace ID
+ *
+ * Structure for passing namespace ID and miscellaneous parameters to
+ * statns(2) and listns(2).
+ *
+ * For statns(2) @param represents the request mask.
+ * For listns(2) @param represents the last listed mount id (or zero).
+ */
+struct ns_id_req {
+ __u32 size;
+ __u32 spare;
+ __u64 ns_id;
+ struct /* listns */ {
+ __u32 ns_type;
+ __u32 spare2;
+ __u64 user_ns_id;
+ };
+};
+
+/*
+ * Special @user_ns_id value that can be passed to listns()
+ */
+#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */
+
+/* List of all ns_id_req versions. */
+#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */
+
#endif /* __LINUX_NSFS_H */
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
index c43a69dffd83..a0c64f415a7f 100644
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -487,7 +487,7 @@ int setup_userns(void)
uid_t uid = getuid();
gid_t gid = getgid();
- ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
+ ret = unshare(CLONE_NEWNS|CLONE_NEWUSER);
if (ret) {
ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
strerror(errno));
diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore
index ccfb40837a73..0989e80da457 100644
--- a/tools/testing/selftests/namespaces/.gitignore
+++ b/tools/testing/selftests/namespaces/.gitignore
@@ -1,3 +1,12 @@
nsid_test
file_handle_test
init_ino_test
+ns_active_ref_test
+listns_test
+listns_permissions_test
+listns_efault_test
+siocgskns_test
+cred_change_test
+stress_test
+listns_pagination_bug
+regression_pidfd_setns_test
diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile
index 5fe4b3dc07d3..fbb821652c17 100644
--- a/tools/testing/selftests/namespaces/Makefile
+++ b/tools/testing/selftests/namespaces/Makefile
@@ -1,7 +1,29 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
-TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test
+TEST_GEN_PROGS := nsid_test \
+ file_handle_test \
+ init_ino_test \
+ ns_active_ref_test \
+ listns_test \
+ listns_permissions_test \
+ listns_efault_test \
+ siocgskns_test \
+ cred_change_test \
+ stress_test \
+ listns_pagination_bug \
+ regression_pidfd_setns_test
include ../lib.mk
+$(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c
+$(OUTPUT)/listns_test: ../filesystems/utils.c
+$(OUTPUT)/listns_permissions_test: ../filesystems/utils.c
+$(OUTPUT)/listns_efault_test: ../filesystems/utils.c
+$(OUTPUT)/siocgskns_test: ../filesystems/utils.c
+$(OUTPUT)/cred_change_test: ../filesystems/utils.c
+$(OUTPUT)/stress_test: ../filesystems/utils.c
+$(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c
+$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c
+
diff --git a/tools/testing/selftests/namespaces/cred_change_test.c b/tools/testing/selftests/namespaces/cred_change_test.c
new file mode 100644
index 000000000000..7b4f5ad3f725
--- /dev/null
+++ b/tools/testing/selftests/namespaces/cred_change_test.c
@@ -0,0 +1,814 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test credential changes and their impact on namespace active references.
+ */
+
+/*
+ * Test setuid() in a user namespace properly swaps active references.
+ * Create a user namespace with multiple UIDs mapped, then setuid() between them.
+ * Verify that the user namespace remains active throughout.
+ */
+TEST(setuid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setuid_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace with multiple UIDs mapped (0-9) */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send namespace ID to parent */
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /*
+ * Perform multiple setuid() calls.
+ * Each setuid() triggers commit_creds() which should properly
+ * swap active references via switch_cred_namespaces().
+ */
+ for (setuid_count = 0; setuid_count < 50; setuid_count++) {
+ uid_t target_uid = (setuid_count % 10);
+ if (setuid(target_uid) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id);
+
+ /* Verify namespace is active while child is running */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(found);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive after child exits */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setuid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test setgid() in a user namespace properly handles active references.
+ */
+TEST(setgid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setgid_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace with multiple GIDs mapped */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /* Perform multiple setgid() calls */
+ for (setgid_count = 0; setgid_count < 50; setgid_count++) {
+ gid_t target_gid = (setgid_count % 10);
+ if (setgid(target_gid) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setgid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test setresuid() which changes real, effective, and saved UIDs.
+ * This should properly swap active references via commit_creds().
+ */
+TEST(setresuid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setres_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /* Perform multiple setresuid() calls */
+ for (setres_count = 0; setres_count < 30; setres_count++) {
+ uid_t uid1 = (setres_count % 5);
+ uid_t uid2 = ((setres_count + 1) % 5);
+ uid_t uid3 = ((setres_count + 2) % 5);
+
+ if (setresuid(uid1, uid2, uid3) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setresuid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test credential changes across multiple user namespaces.
+ * Create nested user namespaces and verify active reference tracking.
+ */
+TEST(cred_change_nested_userns)
+{
+ pid_t pid;
+ int status;
+ __u64 parent_userns_id, child_userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found_parent = false, found_child = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 parent_id, child_id;
+ uid_t orig_uid = getuid();
+
+ close(pipefd[0]);
+
+ /* Create first user namespace */
+ userns_fd = get_userns_fd(0, orig_uid, 1);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get first namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &parent_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create nested user namespace */
+ userns_fd = get_userns_fd(0, 0, 1);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get nested namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send both IDs to parent */
+ write(pipefd[1], &parent_id, sizeof(parent_id));
+ write(pipefd[1], &child_id, sizeof(child_id));
+
+ /* Perform some credential changes in nested namespace */
+ setuid(0);
+ setgid(0);
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ /* Read both namespace IDs */
+ if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get parent namespace ID");
+ }
+
+ if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get child namespace ID");
+ }
+ close(pipefd[0]);
+
+ TH_LOG("Parent userns: %llu, Child userns: %llu",
+ (unsigned long long)parent_userns_id,
+ (unsigned long long)child_userns_id);
+
+ /* Verify both namespaces are active */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_userns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_userns_id)
+ found_child = true;
+ }
+
+ ASSERT_TRUE(found_parent);
+ ASSERT_TRUE(found_child);
+
+ /* Wait for child */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify both namespaces become inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found_parent = false;
+ found_child = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_userns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_userns_id)
+ found_child = true;
+ }
+
+ ASSERT_FALSE(found_parent);
+ ASSERT_FALSE(found_child);
+ TH_LOG("Nested user namespace credential changes preserved active refs (no leak)");
+}
+
+/*
+ * Test rapid credential changes don't cause refcount imbalances.
+ * This stress-tests the switch_cred_namespaces() logic.
+ */
+TEST(rapid_cred_changes_no_leak)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int change_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace with wider range of UIDs/GIDs */
+ userns_fd = get_userns_fd(0, orig_uid, 100);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /*
+ * Perform many rapid credential changes.
+ * Mix setuid, setgid, setreuid, setregid, setresuid, setresgid.
+ */
+ for (change_count = 0; change_count < 200; change_count++) {
+ switch (change_count % 6) {
+ case 0:
+ setuid(change_count % 50);
+ break;
+ case 1:
+ setgid(change_count % 50);
+ break;
+ case 2:
+ setreuid(change_count % 50, (change_count + 1) % 50);
+ break;
+ case 3:
+ setregid(change_count % 50, (change_count + 1) % 50);
+ break;
+ case 4:
+ setresuid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50);
+ break;
+ case 5:
+ setresgid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50);
+ break;
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive (no leaked active refs) */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("200 rapid credential changes completed with no active ref leak");
+}
+
+/*
+ * Test setfsuid/setfsgid which change filesystem UID/GID.
+ * These also trigger credential changes but may have different code paths.
+ */
+TEST(setfsuid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int change_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /* Perform multiple setfsuid/setfsgid calls */
+ for (change_count = 0; change_count < 50; change_count++) {
+ setfsuid(change_count % 10);
+ setfsgid(change_count % 10);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)");
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
new file mode 100644
index 000000000000..c7ed4023d7a8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_efault_test.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "../pidfd/pidfd.h"
+#include "wrappers.h"
+
+/*
+ * Test listns() error handling with invalid buffer addresses.
+ *
+ * When the buffer pointer is invalid (e.g., crossing page boundaries
+ * into unmapped memory), listns() returns EINVAL.
+ *
+ * This test also creates mount namespaces that get destroyed during
+ * iteration, testing that namespace cleanup happens outside the RCU
+ * read lock.
+ */
+TEST(listns_partial_fault_with_ns_cleanup)
+{
+ void *map;
+ __u64 *ns_ids;
+ ssize_t ret;
+ long page_size;
+ pid_t pid, iter_pid;
+ int pidfds[5];
+ int sv[5][2];
+ int iter_pidfd;
+ int i, status;
+ char c;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ ASSERT_GT(page_size, 0);
+
+ /*
+ * Map two pages:
+ * - First page: readable and writable
+ * - Second page: will be unmapped to trigger EFAULT
+ */
+ map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(map, MAP_FAILED);
+
+ /* Unmap the second page */
+ ret = munmap((char *)map + page_size, page_size);
+ ASSERT_EQ(ret, 0);
+
+ /*
+ * Position the buffer pointer so there's room for exactly one u64
+ * before the page boundary. The second u64 would fall into the
+ * unmapped page.
+ */
+ ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
+
+ /*
+ * Create a separate process to run listns() in a loop concurrently
+ * with namespace creation and destruction.
+ */
+ iter_pid = create_child(&iter_pidfd, 0);
+ ASSERT_NE(iter_pid, -1);
+
+ if (iter_pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0, /* All types */
+ .spare2 = 0,
+ .user_ns_id = 0, /* Global listing */
+ };
+ int iter_ret;
+
+ /*
+ * Loop calling listns() until killed.
+ * The kernel should:
+ * 1. Successfully write the first namespace ID (within valid page)
+ * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
+ * 3. Handle concurrent namespace destruction without deadlock
+ */
+ while (1) {
+ iter_ret = sys_listns(&req, ns_ids, 2, 0);
+
+ if (iter_ret == -1 && errno == ENOSYS)
+ _exit(PIDFD_SKIP);
+ }
+ }
+
+ /* Small delay to let iterator start looping */
+ usleep(50000);
+
+ /*
+ * Create several child processes, each in its own mount namespace.
+ * These will be destroyed while the iterator is running listns().
+ */
+ for (i = 0; i < 5; i++) {
+ /* Create socketpair for synchronization */
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+ pid = create_child(&pidfds[i], CLONE_NEWNS);
+ ASSERT_NE(pid, -1);
+
+ if (pid == 0) {
+ close(sv[i][0]); /* Close parent end */
+
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+ _exit(1);
+
+ /* Child: create a couple of tmpfs mounts */
+ if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+ if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+
+ if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+ if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+
+ /* Signal parent that setup is complete */
+ if (write_nointr(sv[i][1], "R", 1) != 1)
+ _exit(1);
+
+ /* Wait for parent to signal us to exit */
+ if (read_nointr(sv[i][1], &c, 1) != 1)
+ _exit(1);
+
+ close(sv[i][1]);
+ _exit(0);
+ }
+
+ close(sv[i][1]); /* Close child end */
+ }
+
+ /* Wait for all children to finish setup */
+ for (i = 0; i < 5; i++) {
+ ret = read_nointr(sv[i][0], &c, 1);
+ ASSERT_EQ(ret, 1);
+ ASSERT_EQ(c, 'R');
+ }
+
+ /*
+ * Signal children to exit. This will destroy their mount namespaces
+ * while listns() is iterating the namespace tree.
+ * This tests that cleanup happens outside the RCU read lock.
+ */
+ for (i = 0; i < 5; i++)
+ write_nointr(sv[i][0], "X", 1);
+
+ /* Wait for all mount namespace children to exit and cleanup */
+ for (i = 0; i < 5; i++) {
+ waitpid(-1, NULL, 0);
+ close(sv[i][0]);
+ close(pidfds[i]);
+ }
+
+ /* Kill iterator and wait for it */
+ sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+ ret = waitpid(iter_pid, &status, 0);
+ ASSERT_EQ(ret, iter_pid);
+ close(iter_pidfd);
+
+ /* Should have been killed */
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+ /* Clean up */
+ munmap(map, page_size);
+}
+
+/*
+ * Test listns() error handling when the entire buffer is invalid.
+ * This is a sanity check that basic invalid pointer detection works.
+ */
+TEST(listns_complete_fault)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 *ns_ids;
+ ssize_t ret;
+
+ /* Use a clearly invalid pointer */
+ ns_ids = (__u64 *)0xdeadbeef;
+
+ ret = sys_listns(&req, ns_ids, 10, 0);
+
+ if (ret == -1 && errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+
+ /* Should fail with EFAULT */
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() error handling when the buffer is NULL.
+ */
+TEST(listns_null_buffer)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ ssize_t ret;
+
+ /* NULL buffer with non-zero count should fail */
+ ret = sys_listns(&req, NULL, 10, 0);
+
+ if (ret == -1 && errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+
+ /* Should fail with EFAULT */
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() with a buffer that becomes invalid mid-iteration
+ * (after several successful writes), combined with mount namespace
+ * destruction to test RCU cleanup logic.
+ */
+TEST(listns_late_fault_with_ns_cleanup)
+{
+ void *map;
+ __u64 *ns_ids;
+ ssize_t ret;
+ long page_size;
+ pid_t pid, iter_pid;
+ int pidfds[10];
+ int sv[10][2];
+ int iter_pidfd;
+ int i, status;
+ char c;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ ASSERT_GT(page_size, 0);
+
+ /* Map two pages */
+ map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(map, MAP_FAILED);
+
+ /* Unmap the second page */
+ ret = munmap((char *)map + page_size, page_size);
+ ASSERT_EQ(ret, 0);
+
+ /*
+ * Position buffer so we can write several u64s successfully
+ * before hitting the page boundary.
+ */
+ ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
+
+ /*
+ * Create a separate process to run listns() concurrently.
+ */
+ iter_pid = create_child(&iter_pidfd, 0);
+ ASSERT_NE(iter_pid, -1);
+
+ if (iter_pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ int iter_ret;
+
+ /*
+ * Loop calling listns() until killed.
+ * Request 10 namespace IDs while namespaces are being destroyed.
+ * This tests:
+ * 1. EFAULT handling when buffer becomes invalid
+ * 2. Namespace cleanup outside RCU read lock during iteration
+ */
+ while (1) {
+ iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+ if (iter_ret == -1 && errno == ENOSYS)
+ _exit(PIDFD_SKIP);
+ }
+ }
+
+ /* Small delay to let iterator start looping */
+ usleep(50000);
+
+ /*
+ * Create more children with mount namespaces to increase the
+ * likelihood that namespace cleanup happens during iteration.
+ */
+ for (i = 0; i < 10; i++) {
+ /* Create socketpair for synchronization */
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+ pid = create_child(&pidfds[i], CLONE_NEWNS);
+ ASSERT_NE(pid, -1);
+
+ if (pid == 0) {
+ close(sv[i][0]); /* Close parent end */
+
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+ _exit(1);
+
+ /* Child: create tmpfs mounts */
+ if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+ if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+
+ if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+ if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+
+ /* Signal parent that setup is complete */
+ if (write_nointr(sv[i][1], "R", 1) != 1)
+ _exit(1);
+
+ /* Wait for parent to signal us to exit */
+ if (read_nointr(sv[i][1], &c, 1) != 1)
+ _exit(1);
+
+ close(sv[i][1]);
+ _exit(0);
+ }
+
+ close(sv[i][1]); /* Close child end */
+ }
+
+ /* Wait for all children to finish setup */
+ for (i = 0; i < 10; i++) {
+ ret = read_nointr(sv[i][0], &c, 1);
+ ASSERT_EQ(ret, 1);
+ ASSERT_EQ(c, 'R');
+ }
+
+ /* Kill half the children */
+ for (i = 0; i < 5; i++)
+ write_nointr(sv[i][0], "X", 1);
+
+ /* Small delay to let some exit */
+ usleep(10000);
+
+ /* Kill remaining children */
+ for (i = 5; i < 10; i++)
+ write_nointr(sv[i][0], "X", 1);
+
+ /* Wait for all children and cleanup */
+ for (i = 0; i < 10; i++) {
+ waitpid(-1, NULL, 0);
+ close(sv[i][0]);
+ close(pidfds[i]);
+ }
+
+ /* Kill iterator and wait for it */
+ sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+ ret = waitpid(iter_pid, &status, 0);
+ ASSERT_EQ(ret, iter_pid);
+ close(iter_pidfd);
+
+ /* Should have been killed */
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+ /* Clean up */
+ munmap(map, page_size);
+}
+
+/*
+ * Test specifically focused on mount namespace cleanup during EFAULT.
+ * Filter for mount namespaces only.
+ */
+TEST(listns_mnt_ns_cleanup_on_fault)
+{
+ void *map;
+ __u64 *ns_ids;
+ ssize_t ret;
+ long page_size;
+ pid_t pid, iter_pid;
+ int pidfds[8];
+ int sv[8][2];
+ int iter_pidfd;
+ int i, status;
+ char c;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ ASSERT_GT(page_size, 0);
+
+ /* Set up partial fault buffer */
+ map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(map, MAP_FAILED);
+
+ ret = munmap((char *)map + page_size, page_size);
+ ASSERT_EQ(ret, 0);
+
+ /* Position for 3 successful writes, then fault */
+ ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
+
+ /*
+ * Create a separate process to run listns() concurrently.
+ */
+ iter_pid = create_child(&iter_pidfd, 0);
+ ASSERT_NE(iter_pid, -1);
+
+ if (iter_pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNS, /* Only mount namespaces */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ int iter_ret;
+
+ /*
+ * Loop calling listns() until killed.
+ * Call listns() to race with namespace destruction.
+ */
+ while (1) {
+ iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+ if (iter_ret == -1 && errno == ENOSYS)
+ _exit(PIDFD_SKIP);
+ }
+ }
+
+ /* Small delay to let iterator start looping */
+ usleep(50000);
+
+ /* Create children with mount namespaces */
+ for (i = 0; i < 8; i++) {
+ /* Create socketpair for synchronization */
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+ pid = create_child(&pidfds[i], CLONE_NEWNS);
+ ASSERT_NE(pid, -1);
+
+ if (pid == 0) {
+ close(sv[i][0]); /* Close parent end */
+
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+ _exit(1);
+
+ /* Do some mount operations to make cleanup more interesting */
+ if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+ if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+
+ if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+ if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+
+ /* Signal parent that setup is complete */
+ if (write_nointr(sv[i][1], "R", 1) != 1)
+ _exit(1);
+
+ /* Wait for parent to signal us to exit */
+ if (read_nointr(sv[i][1], &c, 1) != 1)
+ _exit(1);
+
+ close(sv[i][1]);
+ _exit(0);
+ }
+
+ close(sv[i][1]); /* Close child end */
+ }
+
+ /* Wait for all children to finish setup */
+ for (i = 0; i < 8; i++) {
+ ret = read_nointr(sv[i][0], &c, 1);
+ ASSERT_EQ(ret, 1);
+ ASSERT_EQ(c, 'R');
+ }
+
+ /* Kill children to trigger namespace destruction during iteration */
+ for (i = 0; i < 8; i++)
+ write_nointr(sv[i][0], "X", 1);
+
+ /* Wait for children and cleanup */
+ for (i = 0; i < 8; i++) {
+ waitpid(-1, NULL, 0);
+ close(sv[i][0]);
+ close(pidfds[i]);
+ }
+
+ /* Kill iterator and wait for it */
+ sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+ ret = waitpid(iter_pid, &status, 0);
+ ASSERT_EQ(ret, iter_pid);
+ close(iter_pidfd);
+
+ /* Should have been killed */
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+ munmap(map, page_size);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_pagination_bug.c b/tools/testing/selftests/namespaces/listns_pagination_bug.c
new file mode 100644
index 000000000000..da7d33f96397
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_pagination_bug.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Minimal test case to reproduce KASAN out-of-bounds in listns pagination.
+ *
+ * The bug occurs when:
+ * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER)
+ * 2. Using pagination (req.ns_id != 0)
+ * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of
+ * the filtered type, causing it to search the unified tree and potentially
+ * return a namespace of the wrong type.
+ */
+TEST(pagination_with_type_filter)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER, /* Filter by user namespace */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ pid_t pids[10];
+ int num_children = 10;
+ int i;
+ int sv[2];
+ __u64 first_batch[3];
+ ssize_t ret;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create children with user namespaces */
+ for (i = 0; i < num_children; i++) {
+ pids[i] = fork();
+ ASSERT_GE(pids[i], 0);
+
+ if (pids[i] == 0) {
+ char c;
+ close(sv[0]);
+
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Signal parent we're ready */
+ if (write(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal to exit */
+ if (read(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ close(sv[1]);
+ exit(0);
+ }
+ }
+
+ close(sv[1]);
+
+ /* Wait for all children to signal ready */
+ for (i = 0; i < num_children; i++) {
+ char c;
+ if (read(sv[0], &c, 1) != 1) {
+ close(sv[0]);
+ for (int j = 0; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ /* First batch - this should work */
+ ret = sys_listns(&req, first_batch, 3, 0);
+ if (ret < 0) {
+ if (errno == ENOSYS) {
+ close(sv[0]);
+ for (i = 0; i < num_children; i++)
+ kill(pids[i], SIGKILL);
+ for (i = 0; i < num_children; i++)
+ waitpid(pids[i], NULL, 0);
+ SKIP(return, "listns() not supported");
+ }
+ ASSERT_GE(ret, 0);
+ }
+
+ TH_LOG("First batch returned %zd entries", ret);
+
+ if (ret == 3) {
+ __u64 second_batch[3];
+
+ /* Second batch - pagination triggers the bug */
+ req.ns_id = first_batch[2]; /* Continue from last ID */
+ ret = sys_listns(&req, second_batch, 3, 0);
+
+ TH_LOG("Second batch returned %zd entries", ret);
+ ASSERT_GE(ret, 0);
+ }
+
+ /* Signal all children to exit */
+ for (i = 0; i < num_children; i++) {
+ char c = 'X';
+ if (write(sv[0], &c, 1) != 1) {
+ close(sv[0]);
+ for (int j = i; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ close(sv[0]);
+
+ /* Cleanup */
+ for (i = 0; i < num_children; i++) {
+ int status;
+ waitpid(pids[i], &status, 0);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_permissions_test.c b/tools/testing/selftests/namespaces/listns_permissions_test.c
new file mode 100644
index 000000000000..82d818751a5f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_permissions_test.c
@@ -0,0 +1,759 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test that unprivileged users can only see namespaces they're currently in.
+ * Create a namespace, drop privileges, verify we can only see our own namespaces.
+ */
+TEST(listns_unprivileged_current_only)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool found_ours;
+ int unexpected_count;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 our_netns_id;
+ bool found_ours;
+ int unexpected_count;
+
+ close(pipefd[0]);
+
+ /* Create user namespace to be unprivileged */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Create a network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get our network namespace ID */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &our_netns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Now we're unprivileged - list all network namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* We should only see our own network namespace */
+ found_ours = false;
+ unexpected_count = 0;
+
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == our_netns_id) {
+ found_ours = true;
+ } else {
+ /* This is either init_net (which we can see) or unexpected */
+ unexpected_count++;
+ }
+ }
+
+ /* Send results to parent */
+ write(pipefd[1], &found_ours, sizeof(found_ours));
+ write(pipefd[1], &unexpected_count, sizeof(unexpected_count));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ found_ours = false;
+ unexpected_count = 0;
+ read(pipefd[0], &found_ours, sizeof(found_ours));
+ read(pipefd[0], &unexpected_count, sizeof(unexpected_count));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Child should have seen its own namespace */
+ ASSERT_TRUE(found_ours);
+
+ TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)",
+ unexpected_count);
+}
+
+/*
+ * Test that users with CAP_SYS_ADMIN in a user namespace can see
+ * all namespaces owned by that user namespace.
+ */
+TEST(listns_cap_sys_admin_in_userns)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0, /* All types */
+ .spare2 = 0,
+ .user_ns_id = 0, /* Will be set to our created user namespace */
+ };
+ __u64 ns_ids[100];
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool success;
+ ssize_t count;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 userns_id;
+ ssize_t ret;
+ int min_expected;
+ bool success;
+
+ close(pipefd[0]);
+
+ /* Create user namespace - we'll have CAP_SYS_ADMIN in it */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get the user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create several namespaces owned by this user namespace */
+ unshare(CLONE_NEWNET);
+ unshare(CLONE_NEWUTS);
+ unshare(CLONE_NEWIPC);
+
+ /* List namespaces owned by our user namespace */
+ req.user_ns_id = userns_id;
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /*
+ * We have CAP_SYS_ADMIN in this user namespace,
+ * so we should see all namespaces owned by it.
+ * That includes: net, uts, ipc, and the user namespace itself.
+ */
+ min_expected = 4;
+ success = (ret >= min_expected);
+
+ write(pipefd[1], &success, sizeof(success));
+ write(pipefd[1], &ret, sizeof(ret));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ success = false;
+ count = 0;
+ read(pipefd[0], &success, sizeof(success));
+ read(pipefd[0], &count, sizeof(count));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(success);
+ TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace",
+ count);
+}
+
+/*
+ * Test that users cannot see namespaces from unrelated user namespaces.
+ * Create two sibling user namespaces, verify they can't see each other's
+ * owned namespaces.
+ */
+TEST(listns_cannot_see_sibling_userns_namespaces)
+{
+ int pipefd[2];
+ pid_t pid1, pid2;
+ int status;
+ __u64 netns_a_id;
+ int pipefd2[2];
+ bool found_sibling_netns;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Fork first child - creates user namespace A */
+ pid1 = fork();
+ ASSERT_GE(pid1, 0);
+
+ if (pid1 == 0) {
+ int fd;
+ __u64 netns_a_id;
+ char buf;
+
+ close(pipefd[0]);
+
+ /* Create user namespace A */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Create network namespace owned by user namespace A */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get network namespace ID */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &netns_a_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send namespace ID to parent */
+ write(pipefd[1], &netns_a_id, sizeof(netns_a_id));
+
+ /* Keep alive for sibling to check */
+ read(pipefd[1], &buf, 1);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent reads namespace A ID */
+ close(pipefd[1]);
+ netns_a_id = 0;
+ read(pipefd[0], &netns_a_id, sizeof(netns_a_id));
+
+ TH_LOG("User namespace A created network namespace with ID %llu",
+ (unsigned long long)netns_a_id);
+
+ /* Fork second child - creates user namespace B */
+ ASSERT_EQ(pipe(pipefd2), 0);
+
+ pid2 = fork();
+ ASSERT_GE(pid2, 0);
+
+ if (pid2 == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_sibling_netns;
+
+ close(pipefd[0]);
+ close(pipefd2[0]);
+
+ /* Create user namespace B (sibling to A) */
+ if (setup_userns() < 0) {
+ close(pipefd2[1]);
+ exit(1);
+ }
+
+ /* Try to list all network namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ found_sibling_netns = false;
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_a_id) {
+ found_sibling_netns = true;
+ break;
+ }
+ }
+ }
+
+ /* We should NOT see the sibling's network namespace */
+ write(pipefd2[1], &found_sibling_netns, sizeof(found_sibling_netns));
+ close(pipefd2[1]);
+ exit(0);
+ }
+
+ /* Parent reads result from second child */
+ close(pipefd2[1]);
+ found_sibling_netns = false;
+ read(pipefd2[0], &found_sibling_netns, sizeof(found_sibling_netns));
+ close(pipefd2[0]);
+
+ /* Signal first child to exit */
+ close(pipefd[0]);
+
+ /* Wait for both children */
+ waitpid(pid2, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+
+ waitpid(pid1, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+
+ /* Second child should NOT have seen first child's namespace */
+ ASSERT_FALSE(found_sibling_netns);
+ TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace");
+}
+
+/*
+ * Test permission checking with LISTNS_CURRENT_USER.
+ * Verify that listing with LISTNS_CURRENT_USER respects permissions.
+ */
+TEST(listns_current_user_permissions)
+{
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool success;
+ ssize_t count;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = LISTNS_CURRENT_USER,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool success;
+
+ close(pipefd[0]);
+
+ /* Create user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Create some namespaces owned by this user namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (unshare(CLONE_NEWUTS) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* List with LISTNS_CURRENT_USER - should see our owned namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ success = (ret >= 3); /* At least user, net, uts */
+ write(pipefd[1], &success, sizeof(success));
+ write(pipefd[1], &ret, sizeof(ret));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ success = false;
+ count = 0;
+ read(pipefd[0], &success, sizeof(success));
+ read(pipefd[0], &count, sizeof(count));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(success);
+ TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count);
+}
+
+/*
+ * Test that CAP_SYS_ADMIN in parent user namespace allows seeing
+ * child user namespace's owned namespaces.
+ */
+TEST(listns_parent_userns_cap_sys_admin)
+{
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool found_child_userns;
+ ssize_t count;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 parent_userns_id;
+ __u64 child_userns_id;
+ struct ns_id_req req;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_child_userns;
+
+ close(pipefd[0]);
+
+ /* Create parent user namespace - we have CAP_SYS_ADMIN in it */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get parent user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create child user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get child user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create namespaces owned by child user namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* List namespaces owned by parent user namespace */
+ req.size = sizeof(req);
+ req.spare = 0;
+ req.ns_id = 0;
+ req.ns_type = 0;
+ req.spare2 = 0;
+ req.user_ns_id = parent_userns_id;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ /* Should see child user namespace in the list */
+ found_child_userns = false;
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == child_userns_id) {
+ found_child_userns = true;
+ break;
+ }
+ }
+ }
+
+ write(pipefd[1], &found_child_userns, sizeof(found_child_userns));
+ write(pipefd[1], &ret, sizeof(ret));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ found_child_userns = false;
+ count = 0;
+ read(pipefd[0], &found_child_userns, sizeof(found_child_userns));
+ read(pipefd[0], &count, sizeof(count));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(found_child_userns);
+ TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)",
+ count);
+}
+
+/*
+ * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of.
+ * This is different from seeing namespaces owned by a user namespace.
+ */
+TEST(listns_cap_sys_admin_inside_userns)
+{
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool found_ours;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 our_userns_id;
+ struct ns_id_req req;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_ours;
+
+ close(pipefd[0]);
+
+ /* Create user namespace - we have CAP_SYS_ADMIN inside it */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get our user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* List all user namespaces globally */
+ req.size = sizeof(req);
+ req.spare = 0;
+ req.ns_id = 0;
+ req.ns_type = CLONE_NEWUSER;
+ req.spare2 = 0;
+ req.user_ns_id = 0;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ /* We should be able to see our own user namespace */
+ found_ours = false;
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == our_userns_id) {
+ found_ours = true;
+ break;
+ }
+ }
+ }
+
+ write(pipefd[1], &found_ours, sizeof(found_ours));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ found_ours = false;
+ read(pipefd[0], &found_ours, sizeof(found_ours));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(found_ours);
+ TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of");
+}
+
+/*
+ * Test that dropping CAP_SYS_ADMIN restricts what we can see.
+ */
+TEST(listns_drop_cap_sys_admin)
+{
+ cap_t caps;
+ cap_value_t cap_list[1] = { CAP_SYS_ADMIN };
+
+ /* This test needs to start with CAP_SYS_ADMIN */
+ caps = cap_get_proc();
+ if (!caps) {
+ SKIP(return, "Cannot get capabilities");
+ }
+
+ cap_flag_value_t cap_val;
+ if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) {
+ cap_free(caps);
+ SKIP(return, "Cannot check CAP_SYS_ADMIN");
+ }
+
+ if (cap_val != CAP_SET) {
+ cap_free(caps);
+ SKIP(return, "Test needs CAP_SYS_ADMIN to start");
+ }
+ cap_free(caps);
+
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool correct;
+ ssize_t count_before, count_after;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = LISTNS_CURRENT_USER,
+ };
+ __u64 ns_ids_before[100];
+ ssize_t count_before;
+ __u64 ns_ids_after[100];
+ ssize_t count_after;
+ bool correct;
+
+ close(pipefd[0]);
+
+ /* Create user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Count namespaces with CAP_SYS_ADMIN */
+ count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+
+ /* Drop CAP_SYS_ADMIN */
+ caps = cap_get_proc();
+ if (caps) {
+ cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR);
+ cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR);
+ cap_set_proc(caps);
+ cap_free(caps);
+ }
+
+ /* Ensure we can't regain the capability */
+ prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+
+ /* Count namespaces without CAP_SYS_ADMIN */
+ count_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+
+ /* Without CAP_SYS_ADMIN, we should see same or fewer namespaces */
+ correct = (count_after <= count_before);
+
+ write(pipefd[1], &correct, sizeof(correct));
+ write(pipefd[1], &count_before, sizeof(count_before));
+ write(pipefd[1], &count_after, sizeof(count_after));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ correct = false;
+ count_before = 0;
+ count_after = 0;
+ read(pipefd[0], &correct, sizeof(correct));
+ read(pipefd[0], &count_before, sizeof(count_before));
+ read(pipefd[0], &count_after, sizeof(count_after));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(correct);
+ TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces",
+ count_before, count_after);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_test.c b/tools/testing/selftests/namespaces/listns_test.c
new file mode 100644
index 000000000000..8a95789d6a87
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_test.c
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test basic listns() functionality with the unified namespace tree.
+ * List all active namespaces globally.
+ */
+TEST(listns_basic_unified)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0, /* All types */
+ .spare2 = 0,
+ .user_ns_id = 0, /* Global listing */
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+
+ /* Should find at least the initial namespaces */
+ ASSERT_GT(ret, 0);
+ TH_LOG("Found %zd active namespaces", ret);
+
+ /* Verify all returned IDs are non-zero */
+ for (ssize_t i = 0; i < ret; i++) {
+ ASSERT_NE(ns_ids[i], 0);
+ TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+ }
+}
+
+/*
+ * Test listns() with type filtering.
+ * List only network namespaces.
+ */
+TEST(listns_filter_by_type)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET, /* Only network namespaces */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret, 0);
+
+ /* Should find at least init_net */
+ ASSERT_GT(ret, 0);
+ TH_LOG("Found %zd active network namespaces", ret);
+
+ /* Verify we can open each namespace and it's actually a network namespace */
+ for (ssize_t i = 0; i < ret && i < 5; i++) {
+ struct nsfs_file_handle nsfh = {
+ .ns_id = ns_ids[i],
+ .ns_type = CLONE_NEWNET,
+ .ns_inum = 0,
+ };
+ struct file_handle *fh;
+ int fd;
+
+ fh = (struct file_handle *)malloc(sizeof(*fh) + sizeof(nsfh));
+ ASSERT_NE(fh, NULL);
+ fh->handle_bytes = sizeof(nsfh);
+ fh->handle_type = 0;
+ memcpy(fh->f_handle, &nsfh, sizeof(nsfh));
+
+ fd = open_by_handle_at(-10003, fh, O_RDONLY);
+ free(fh);
+
+ if (fd >= 0) {
+ int ns_type;
+ /* Verify it's a network namespace via ioctl */
+ ns_type = ioctl(fd, NS_GET_NSTYPE);
+ if (ns_type >= 0) {
+ ASSERT_EQ(ns_type, CLONE_NEWNET);
+ }
+ close(fd);
+ }
+ }
+}
+
+/*
+ * Test listns() pagination.
+ * List namespaces in batches.
+ */
+TEST(listns_pagination)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 batch1[2], batch2[2];
+ ssize_t ret1, ret2;
+
+ /* Get first batch */
+ ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0);
+ if (ret1 < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret1, 0);
+
+ if (ret1 == 0)
+ SKIP(return, "No namespaces found");
+
+ TH_LOG("First batch: %zd namespaces", ret1);
+
+ /* Get second batch using last ID from first batch */
+ if (ret1 == ARRAY_SIZE(batch1)) {
+ req.ns_id = batch1[ret1 - 1];
+ ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0);
+ ASSERT_GE(ret2, 0);
+
+ TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)",
+ ret2, (unsigned long long)req.ns_id);
+
+ /* If we got more results, verify IDs are monotonically increasing */
+ if (ret2 > 0) {
+ ASSERT_GT(batch2[0], batch1[ret1 - 1]);
+ TH_LOG("Pagination working: %llu > %llu",
+ (unsigned long long)batch2[0],
+ (unsigned long long)batch1[ret1 - 1]);
+ }
+ } else {
+ TH_LOG("All namespaces fit in first batch");
+ }
+}
+
+/*
+ * Test listns() with LISTNS_CURRENT_USER.
+ * List namespaces owned by current user namespace.
+ */
+TEST(listns_current_user)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = LISTNS_CURRENT_USER,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret, 0);
+
+ /* Should find at least the initial namespaces if we're in init_user_ns */
+ TH_LOG("Found %zd namespaces owned by current user namespace", ret);
+
+ for (ssize_t i = 0; i < ret; i++)
+ TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+}
+
+/*
+ * Test that listns() only returns active namespaces.
+ * Create a namespace, let it become inactive, verify it's not listed.
+ */
+TEST(listns_only_active)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[100], ns_ids_after[100];
+ ssize_t ret_before, ret_after;
+ int pipefd[2];
+ pid_t pid;
+ __u64 new_ns_id = 0;
+ int status;
+
+ /* Get initial list */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret_before, 0);
+
+ TH_LOG("Before: %zd active network namespaces", ret_before);
+
+ /* Create a new namespace in a child process and get its ID */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 ns_id;
+
+ close(pipefd[0]);
+
+ /* Create new network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get its ID */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send ID to parent */
+ write(pipefd[1], &ns_id, sizeof(ns_id));
+ close(pipefd[1]);
+
+ /* Keep namespace active briefly */
+ usleep(100000);
+ exit(0);
+ }
+
+ /* Parent reads the new namespace ID */
+ {
+ int bytes;
+
+ close(pipefd[1]);
+ bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id));
+ close(pipefd[0]);
+
+ if (bytes == sizeof(new_ns_id)) {
+ __u64 ns_ids_during[100];
+ int ret_during;
+
+ TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id);
+
+ /* List namespaces while child is still alive - should see new one */
+ ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0);
+ ASSERT_GE(ret_during, 0);
+ TH_LOG("During: %d active network namespaces", ret_during);
+
+ /* Should have more namespaces than before */
+ ASSERT_GE(ret_during, ret_before);
+ }
+ }
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+
+ /* Give time for namespace to become inactive */
+ usleep(100000);
+
+ /* List namespaces after child exits - should not see new one */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+ TH_LOG("After: %zd active network namespaces", ret_after);
+
+ /* Verify the new namespace ID is not in the after list */
+ if (new_ns_id != 0) {
+ bool found = false;
+
+ for (ssize_t i = 0; i < ret_after; i++) {
+ if (ns_ids_after[i] == new_ns_id) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_FALSE(found);
+ }
+}
+
+/*
+ * Test listns() with specific user namespace ID.
+ * Create a user namespace and list namespaces it owns.
+ */
+TEST(listns_specific_userns)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0, /* Will be filled with created userns ID */
+ };
+ __u64 ns_ids[100];
+ int sv[2];
+ pid_t pid;
+ int status;
+ __u64 user_ns_id = 0;
+ int bytes;
+ ssize_t ret;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 ns_id;
+ char buf;
+
+ close(sv[0]);
+
+ /* Create new user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send ID to parent */
+ if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Create some namespaces owned by this user namespace */
+ unshare(CLONE_NEWNET);
+ unshare(CLONE_NEWUTS);
+
+ /* Wait for parent signal */
+ if (read(sv[1], &buf, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+ close(sv[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(sv[1]);
+ bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id));
+
+ if (bytes != sizeof(user_ns_id)) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get user namespace ID from child");
+ }
+
+ TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id);
+
+ /* List namespaces owned by this user namespace */
+ req.user_ns_id = user_ns_id;
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ if (ret < 0) {
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOSYS) {
+ SKIP(return, "listns() not supported");
+ }
+ ASSERT_GE(ret, 0);
+ }
+
+ TH_LOG("Found %zd namespaces owned by user namespace %llu", ret,
+ (unsigned long long)user_ns_id);
+
+ /* Should find at least the network and UTS namespaces we created */
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret && i < 10; i++)
+ TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+ }
+
+ /* Signal child to exit */
+ if (write(sv[0], "X", 1) != 1) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ close(sv[0]);
+ waitpid(pid, &status, 0);
+}
+
+/*
+ * Test listns() with multiple namespace types filter.
+ */
+TEST(listns_multiple_types)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET | CLONE_NEWUTS, /* Network and UTS */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret, 0);
+
+ TH_LOG("Found %zd active network/UTS namespaces", ret);
+
+ for (ssize_t i = 0; i < ret; i++)
+ TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+}
+
+/*
+ * Test that hierarchical active reference propagation keeps parent
+ * user namespaces visible in listns().
+ */
+TEST(listns_hierarchical_visibility)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 parent_ns_id = 0, child_ns_id = 0;
+ int sv[2];
+ pid_t pid;
+ int status;
+ int bytes;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_parent, found_child;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ char buf;
+
+ close(sv[0]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create child user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send both IDs to parent */
+ if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+ if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal */
+ if (read(sv[1], &buf, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+ close(sv[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(sv[1]);
+
+ /* Read both namespace IDs */
+ bytes = read(sv[0], &parent_ns_id, sizeof(parent_ns_id));
+ bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id));
+
+ if (bytes != (int)(2 * sizeof(__u64))) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace IDs from child");
+ }
+
+ TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id);
+ TH_LOG("Child user namespace ID: %llu", (unsigned long long)child_ns_id);
+
+ /* List all user namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ if (ret < 0 && errno == ENOSYS) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "listns() not supported");
+ }
+
+ ASSERT_GE(ret, 0);
+ TH_LOG("Found %zd active user namespaces", ret);
+
+ /* Both parent and child should be visible (active due to child process) */
+ found_parent = false;
+ found_child = false;
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_ns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_ns_id)
+ found_child = true;
+ }
+
+ TH_LOG("Parent namespace %s, child namespace %s",
+ found_parent ? "found" : "NOT FOUND",
+ found_child ? "found" : "NOT FOUND");
+
+ ASSERT_TRUE(found_child);
+ /* With hierarchical propagation, parent should also be active */
+ ASSERT_TRUE(found_parent);
+
+ /* Signal child to exit */
+ if (write(sv[0], "X", 1) != 1) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ close(sv[0]);
+ waitpid(pid, &status, 0);
+}
+
+/*
+ * Test error cases for listns().
+ */
+TEST(listns_error_cases)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[10];
+ int ret;
+
+ /* Test with invalid flags */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF);
+ if (errno == ENOSYS) {
+ /* listns() not supported, skip this check */
+ } else {
+ ASSERT_LT(ret, 0);
+ ASSERT_EQ(errno, EINVAL);
+ }
+
+ /* Test with NULL ns_ids array */
+ ret = sys_listns(&req, NULL, 10, 0);
+ ASSERT_LT(ret, 0);
+
+ /* Test with invalid spare field */
+ req.spare = 1;
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (errno == ENOSYS) {
+ /* listns() not supported, skip this check */
+ } else {
+ ASSERT_LT(ret, 0);
+ ASSERT_EQ(errno, EINVAL);
+ }
+ req.spare = 0;
+
+ /* Test with huge nr_ns_ids */
+ ret = sys_listns(&req, ns_ids, 2000000, 0);
+ if (errno == ENOSYS) {
+ /* listns() not supported, skip this check */
+ } else {
+ ASSERT_LT(ret, 0);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/ns_active_ref_test.c b/tools/testing/selftests/namespaces/ns_active_ref_test.c
new file mode 100644
index 000000000000..093268f0efaa
--- /dev/null
+++ b/tools/testing/selftests/namespaces/ns_active_ref_test.c
@@ -0,0 +1,2672 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <pthread.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
+#endif
+
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
+
+/*
+ * Test that initial namespaces can be reopened via file handle.
+ * Initial namespaces should have active ref count of 1 from boot.
+ */
+TEST(init_ns_always_active)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd1, fd2;
+ struct stat st1, st2;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open initial network namespace */
+ fd1 = open("/proc/1/ns/net", O_RDONLY);
+ ASSERT_GE(fd1, 0);
+
+ /* Get file handle for initial namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd1, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(fd1);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+
+ /* Close the namespace fd */
+ close(fd1);
+
+ /* Try to reopen via file handle - should succeed since init ns is always active */
+ fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd2 < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle);
+ return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd2, 0);
+
+ /* Verify we opened the same namespace */
+ fd1 = open("/proc/1/ns/net", O_RDONLY);
+ ASSERT_GE(fd1, 0);
+ ASSERT_EQ(fstat(fd1, &st1), 0);
+ ASSERT_EQ(fstat(fd2, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ close(fd1);
+ close(fd2);
+ free(handle);
+}
+
+/*
+ * Test namespace lifecycle: create a namespace in a child process,
+ * get a file handle while it's active, then try to reopen after
+ * the process exits (namespace becomes inactive).
+ */
+TEST(ns_inactive_after_exit)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ /* Create pipe for passing file handle from child */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Open our new namespace */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get file handle for the namespace */
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+
+ /* Exit - namespace should become inactive */
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ /* Read file handle from child */
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /* Try to reopen namespace - should fail with ENOENT since it's inactive */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ /* Should fail with ENOENT (namespace inactive) or ESTALE */
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that a namespace remains active while a process is using it,
+ * even after the creating process exits.
+ */
+TEST(ns_active_with_multiple_processes)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ int syncpipe[2];
+ pid_t pid1, pid2;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+
+ /* Create pipes for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ pid1 = fork();
+ ASSERT_GE(pid1, 0);
+
+ if (pid1 == 0) {
+ /* First child - creates namespace */
+ close(pipefd[0]);
+ close(syncpipe[1]);
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ /* Open and get handle */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+
+ /* Wait for signal before exiting */
+ read(syncpipe[0], &sync_byte, 1);
+ close(syncpipe[0]);
+ exit(0);
+ }
+
+ /* Parent reads handle */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+ ASSERT_GT(ret, 0);
+
+ handle = (struct file_handle *)buf;
+
+ /* Create second child that will keep namespace active */
+ pid2 = fork();
+ ASSERT_GE(pid2, 0);
+
+ if (pid2 == 0) {
+ /* Second child - reopens the namespace */
+ close(syncpipe[0]);
+ close(syncpipe[1]);
+
+ /* Open the namespace via handle */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0) {
+ exit(1);
+ }
+
+ /* Join the namespace */
+ ret = setns(fd, CLONE_NEWNET);
+ close(fd);
+ if (ret < 0) {
+ exit(1);
+ }
+
+ /* Sleep to keep namespace active */
+ sleep(1);
+ exit(0);
+ }
+
+ /* Let second child enter the namespace */
+ usleep(100000); /* 100ms */
+
+ /* Signal first child to exit */
+ close(syncpipe[0]);
+ sync_byte = 'X';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ /* Wait for first child */
+ waitpid(pid1, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+
+ /* Namespace should still be active because second child is using it */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(fd, 0);
+ close(fd);
+
+ /* Wait for second child */
+ waitpid(pid2, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+}
+
+/*
+ * Test user namespace active ref tracking via credential lifecycle
+ */
+TEST(userns_active_ref_lifecycle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Set up uid/gid mappings */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) {
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+ }
+
+ /* Get file handle */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /* Namespace should be inactive after all tasks exit */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test PID namespace active ref tracking
+ */
+TEST(pidns_active_ref_lifecycle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new PID namespace */
+ ret = unshare(CLONE_NEWPID);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Fork to actually enter the PID namespace */
+ pid_t child = fork();
+ if (child < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (child == 0) {
+ /* Grandchild - in new PID namespace */
+ fd = open("/proc/self/ns/pid", O_RDONLY);
+ if (fd < 0) {
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ exit(1);
+ }
+
+ /* Send handle to grandparent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Wait for grandchild */
+ waitpid(child, NULL, 0);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /* Namespace should be inactive after all processes exit */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that an open file descriptor keeps a namespace active.
+ * Even after the creating process exits, the namespace should remain
+ * active as long as an fd is held open.
+ */
+TEST(ns_fd_keeps_active)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int nsfd;
+ int pipe_child_ready[2];
+ int pipe_parent_ready[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+ char proc_path[64];
+
+ ASSERT_EQ(pipe(pipe_child_ready), 0);
+ ASSERT_EQ(pipe(pipe_parent_ready), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipe_child_ready[0]);
+ close(pipe_parent_ready[1]);
+
+ TH_LOG("Child: creating new network namespace");
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+ close(pipe_child_ready[1]);
+ close(pipe_parent_ready[0]);
+ exit(1);
+ }
+
+ TH_LOG("Child: network namespace created successfully");
+
+ /* Get file handle for the namespace */
+ nsfd = open("/proc/self/ns/net", O_RDONLY);
+ if (nsfd < 0) {
+ TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno));
+ close(pipe_child_ready[1]);
+ close(pipe_parent_ready[0]);
+ exit(1);
+ }
+
+ TH_LOG("Child: opened namespace fd %d", nsfd);
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(nsfd);
+
+ if (ret < 0) {
+ TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno));
+ close(pipe_child_ready[1]);
+ close(pipe_parent_ready[0]);
+ exit(1);
+ }
+
+ TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes);
+
+ /* Send file handle to parent */
+ ret = write(pipe_child_ready[1], buf, sizeof(*handle) + handle->handle_bytes);
+ TH_LOG("Child: sent %d bytes of file handle to parent", ret);
+ close(pipe_child_ready[1]);
+
+ /* Wait for parent to open the fd */
+ TH_LOG("Child: waiting for parent to open fd");
+ ret = read(pipe_parent_ready[0], &sync_byte, 1);
+ close(pipe_parent_ready[0]);
+
+ TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret);
+ /* Exit - namespace should stay active because parent holds fd */
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipe_child_ready[1]);
+ close(pipe_parent_ready[0]);
+
+ TH_LOG("Parent: reading file handle from child");
+
+ /* Read file handle from child */
+ ret = read(pipe_child_ready[0], buf, sizeof(buf));
+ close(pipe_child_ready[0]);
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes);
+
+ /* Open the child's namespace while it's still alive */
+ snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid);
+ TH_LOG("Parent: opening child's namespace at %s", proc_path);
+ nsfd = open(proc_path, O_RDONLY);
+ if (nsfd < 0) {
+ TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno));
+ close(pipe_parent_ready[1]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child's namespace");
+ }
+
+ TH_LOG("Parent: opened child's namespace, got fd %d", nsfd);
+
+ /* Signal child that we have the fd */
+ sync_byte = 'G';
+ write(pipe_parent_ready[1], &sync_byte, 1);
+ close(pipe_parent_ready[1]);
+ TH_LOG("Parent: signaled child that we have the fd");
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ TH_LOG("Child exited, parent holds fd %d to namespace", nsfd);
+
+ /*
+ * Namespace should still be ACTIVE because we hold an fd.
+ * We should be able to reopen it via file handle.
+ */
+ TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)");
+ int fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(fd2, 0);
+
+ TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2);
+
+ /* Verify it's the same namespace */
+ struct stat st1, st2;
+ ASSERT_EQ(fstat(nsfd, &st1), 0);
+ ASSERT_EQ(fstat(fd2, &st2), 0);
+ TH_LOG("Namespace inodes: nsfd=%lu, fd2=%lu", st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ close(fd2);
+
+ /* Now close the fd - namespace should become inactive */
+ TH_LOG("Closing fd %d - namespace should become inactive", nsfd);
+ close(nsfd);
+
+ /* Now reopening should fail - namespace is inactive */
+ TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)");
+ fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd2, 0);
+ /* Should fail with ENOENT (inactive) or ESTALE (gone) */
+ TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test hierarchical active reference propagation.
+ * When a child namespace is active, its owning user namespace should also
+ * be active automatically due to hierarchical active reference propagation.
+ * This ensures parents are always reachable when children are active.
+ */
+TEST(ns_parent_always_reachable)
+{
+ struct file_handle *parent_handle, *child_handle;
+ int ret;
+ int child_nsfd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 parent_id, child_id;
+ char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+ char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ TH_LOG("Child: creating parent user namespace and setting up mappings");
+
+ /* Create parent user namespace with mappings */
+ ret = setup_userns();
+ if (ret < 0) {
+ TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno));
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid());
+
+ /* Get namespace ID for parent user namespace */
+ int parent_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (parent_fd < 0) {
+ TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno));
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ TH_LOG("Child: opened parent userns fd %d", parent_fd);
+
+ if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) {
+ TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno));
+ close(parent_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(parent_fd);
+
+ TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id);
+
+ /* Create child user namespace within parent */
+ TH_LOG("Child: creating nested child user namespace");
+ ret = setup_userns();
+ if (ret < 0) {
+ TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno));
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", getuid(), getgid());
+
+ /* Get namespace ID for child user namespace */
+ int child_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (child_fd < 0) {
+ TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno));
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ TH_LOG("Child: opened child userns fd %d", child_fd);
+
+ if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) {
+ TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno));
+ close(child_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(child_fd);
+
+ TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id);
+
+ /* Send both namespace IDs to parent */
+ TH_LOG("Child: sending both namespace IDs to parent");
+ write(pipefd[1], &parent_id, sizeof(parent_id));
+ write(pipefd[1], &child_id, sizeof(child_id));
+ close(pipefd[1]);
+
+ TH_LOG("Child: exiting - parent userns should become inactive");
+ /* Exit - parent user namespace should become inactive */
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ TH_LOG("Parent: reading both namespace IDs from child");
+
+ /* Read both namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &parent_id, sizeof(parent_id));
+ if (ret != sizeof(parent_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID from child");
+ }
+
+ ret = read(pipefd[0], &child_id, sizeof(child_id));
+ close(pipefd[0]);
+ if (ret != sizeof(child_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read child namespace ID from child");
+ }
+
+ TH_LOG("Parent: received parent_id=%llu, child_id=%llu",
+ (unsigned long long)parent_id, (unsigned long long)child_id);
+
+ /* Construct file handles from namespace IDs */
+ parent_handle = (struct file_handle *)parent_buf;
+ parent_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ parent_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle;
+ parent_fh->ns_id = parent_id;
+ parent_fh->ns_type = 0;
+ parent_fh->ns_inum = 0;
+
+ child_handle = (struct file_handle *)child_buf;
+ child_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ child_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *child_fh = (struct nsfs_file_handle *)child_handle->f_handle;
+ child_fh->ns_id = child_id;
+ child_fh->ns_type = 0;
+ child_fh->ns_inum = 0;
+
+ TH_LOG("Parent: opening child namespace BEFORE child exits");
+
+ /* Open child namespace while child is still alive to keep it active */
+ child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY);
+ if (child_nsfd < 0) {
+ TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(errno), errno);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child namespace");
+ }
+
+ TH_LOG("Opened child namespace fd %d", child_nsfd);
+
+ /* Now wait for child to exit */
+ TH_LOG("Parent: waiting for child to exit");
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ TH_LOG("Child process exited, parent holds fd to child namespace");
+
+ /*
+ * With hierarchical active reference propagation:
+ * Since the child namespace is active (parent process holds fd),
+ * the parent user namespace should ALSO be active automatically.
+ * This is because when we took an active reference on the child,
+ * it propagated up to the owning user namespace.
+ */
+ TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)");
+ int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_GE(parent_fd, 0);
+
+ TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd);
+
+ /* Verify we can also get parent via NS_GET_USERNS */
+ TH_LOG("Verifying NS_GET_USERNS also works");
+ int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS);
+ if (parent_fd2 < 0) {
+ close(parent_fd);
+ close(child_nsfd);
+ TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno);
+ SKIP(return, "NS_GET_USERNS not supported or failed");
+ }
+
+ TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2);
+
+ /* Verify both methods give us the same namespace */
+ struct stat st1, st2;
+ ASSERT_EQ(fstat(parent_fd, &st1), 0);
+ ASSERT_EQ(fstat(parent_fd2, &st2), 0);
+ TH_LOG("Parent namespace inodes: parent_fd=%lu, parent_fd2=%lu", st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ /*
+ * Close child fd - parent should remain active because we still
+ * hold direct references to it (parent_fd and parent_fd2).
+ */
+ TH_LOG("Closing child fd - parent should remain active (direct refs held)");
+ close(child_nsfd);
+
+ /* Parent should still be openable */
+ TH_LOG("Verifying parent still active via file handle");
+ int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_GE(parent_fd3, 0);
+ close(parent_fd3);
+
+ TH_LOG("Closing all fds to parent namespace");
+ close(parent_fd);
+ close(parent_fd2);
+
+ /* Both should now be inactive */
+ TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)");
+ parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_LT(parent_fd, 0);
+ TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that bind mounts keep namespaces in the tree even when inactive
+ */
+TEST(ns_bind_mount_keeps_in_tree)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+ char tmpfile[] = "/tmp/ns-test-XXXXXX";
+ int tmpfd;
+
+ /* Create temporary file for bind mount */
+ tmpfd = mkstemp(tmpfile);
+ if (tmpfd < 0) {
+ SKIP(return, "Cannot create temporary file");
+ }
+ close(tmpfd);
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Unshare mount namespace and make mounts private to avoid propagation */
+ ret = unshare(CLONE_NEWNS);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+ ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Bind mount the namespace */
+ ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Get file handle */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ umount(tmpfile);
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ umount(tmpfile);
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /*
+ * Namespace should be inactive but still in tree due to bind mount.
+ * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree).
+ */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ /* Should be ENOENT (inactive) since bind mount keeps it in tree */
+ if (errno != ENOENT && errno != ESTALE) {
+ TH_LOG("Unexpected error: %d", errno);
+ }
+
+ /* Cleanup */
+ umount(tmpfile);
+ unlink(tmpfile);
+}
+
+/*
+ * Test multi-level hierarchy (3+ levels deep).
+ * Grandparent → Parent → Child
+ * When child is active, both parent AND grandparent should be active.
+ */
+TEST(ns_multilevel_hierarchy)
+{
+ struct file_handle *gp_handle, *p_handle, *c_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 gp_id, p_id, c_id;
+ char gp_buf[sizeof(*gp_handle) + MAX_HANDLE_SZ];
+ char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ];
+ char c_buf[sizeof(*c_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create grandparent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int gp_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (gp_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(gp_fd, NS_GET_ID, &gp_id) < 0) {
+ close(gp_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(gp_fd);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int p_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (p_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+ close(p_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(p_fd);
+
+ /* Create child user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int c_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (c_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c_fd, NS_GET_ID, &c_id) < 0) {
+ close(c_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c_fd);
+
+ /* Send all three namespace IDs */
+ write(pipefd[1], &gp_id, sizeof(gp_id));
+ write(pipefd[1], &p_id, sizeof(p_id));
+ write(pipefd[1], &c_id, sizeof(c_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &gp_id, sizeof(gp_id));
+ if (ret != sizeof(gp_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read grandparent namespace ID from child");
+ }
+
+ ret = read(pipefd[0], &p_id, sizeof(p_id));
+ if (ret != sizeof(p_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID from child");
+ }
+
+ ret = read(pipefd[0], &c_id, sizeof(c_id));
+ close(pipefd[0]);
+ if (ret != sizeof(c_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read child namespace ID from child");
+ }
+
+ /* Construct file handles from namespace IDs */
+ gp_handle = (struct file_handle *)gp_buf;
+ gp_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ gp_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *gp_fh = (struct nsfs_file_handle *)gp_handle->f_handle;
+ gp_fh->ns_id = gp_id;
+ gp_fh->ns_type = 0;
+ gp_fh->ns_inum = 0;
+
+ p_handle = (struct file_handle *)p_buf;
+ p_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ p_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle;
+ p_fh->ns_id = p_id;
+ p_fh->ns_type = 0;
+ p_fh->ns_inum = 0;
+
+ c_handle = (struct file_handle *)c_buf;
+ c_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c_fh = (struct nsfs_file_handle *)c_handle->f_handle;
+ c_fh->ns_id = c_id;
+ c_fh->ns_type = 0;
+ c_fh->ns_inum = 0;
+
+ /* Open child before process exits */
+ int c_fd = open_by_handle_at(FD_NSFS_ROOT, c_handle, O_RDONLY);
+ if (c_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child namespace");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /*
+ * With 3-level hierarchy and child active:
+ * - Child is active (we hold fd)
+ * - Parent should be active (propagated from child)
+ * - Grandparent should be active (propagated from parent)
+ */
+ TH_LOG("Testing parent active when child is active");
+ int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+
+ TH_LOG("Testing grandparent active when child is active");
+ int gp_fd = open_by_handle_at(FD_NSFS_ROOT, gp_handle, O_RDONLY);
+ ASSERT_GE(gp_fd, 0);
+
+ close(c_fd);
+ close(p_fd);
+ close(gp_fd);
+}
+
+/*
+ * Test multiple children sharing same parent.
+ * Parent should stay active as long as ANY child is active.
+ */
+TEST(ns_multiple_children_same_parent)
+{
+ struct file_handle *p_handle, *c1_handle, *c2_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 p_id, c1_id, c2_id;
+ char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ];
+ char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ];
+ char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int p_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (p_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+ close(p_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(p_fd);
+
+ /* Create first child user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int c1_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (c1_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) {
+ close(c1_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c1_fd);
+
+ /* Return to parent user namespace and create second child */
+ /* We can't actually do this easily, so let's create a sibling namespace
+ * by creating a network namespace instead */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int c2_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (c2_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) {
+ close(c2_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c2_fd);
+
+ /* Send all namespace IDs */
+ write(pipefd[1], &p_id, sizeof(p_id));
+ write(pipefd[1], &c1_id, sizeof(c1_id));
+ write(pipefd[1], &c2_id, sizeof(c2_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &p_id, sizeof(p_id));
+ if (ret != sizeof(p_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID");
+ }
+
+ ret = read(pipefd[0], &c1_id, sizeof(c1_id));
+ if (ret != sizeof(c1_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read first child namespace ID");
+ }
+
+ ret = read(pipefd[0], &c2_id, sizeof(c2_id));
+ close(pipefd[0]);
+ if (ret != sizeof(c2_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read second child namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ p_handle = (struct file_handle *)p_buf;
+ p_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ p_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle;
+ p_fh->ns_id = p_id;
+ p_fh->ns_type = 0;
+ p_fh->ns_inum = 0;
+
+ c1_handle = (struct file_handle *)c1_buf;
+ c1_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c1_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle;
+ c1_fh->ns_id = c1_id;
+ c1_fh->ns_type = 0;
+ c1_fh->ns_inum = 0;
+
+ c2_handle = (struct file_handle *)c2_buf;
+ c2_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c2_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle;
+ c2_fh->ns_id = c2_id;
+ c2_fh->ns_type = 0;
+ c2_fh->ns_inum = 0;
+
+ /* Open both children before process exits */
+ int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY);
+ int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY);
+
+ if (c1_fd < 0 || c2_fd < 0) {
+ if (c1_fd >= 0) close(c1_fd);
+ if (c2_fd >= 0) close(c2_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child namespaces");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Parent should be active (both children active) */
+ TH_LOG("Both children active - parent should be active");
+ int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close first child - parent should STILL be active */
+ TH_LOG("Closing first child - parent should still be active");
+ close(c1_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close second child - NOW parent should become inactive */
+ TH_LOG("Closing second child - parent should become inactive");
+ close(c2_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_LT(p_fd, 0);
+}
+
+/*
+ * Test that different namespace types with same owner all contribute
+ * active references to the owning user namespace.
+ */
+TEST(ns_different_types_same_owner)
+{
+ struct file_handle *u_handle, *n_handle, *ut_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 u_id, n_id, ut_id;
+ char u_buf[sizeof(*u_handle) + MAX_HANDLE_SZ];
+ char n_buf[sizeof(*n_handle) + MAX_HANDLE_SZ];
+ char ut_buf[sizeof(*ut_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int u_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (u_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) {
+ close(u_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(u_fd);
+
+ /* Create network namespace (owned by user namespace) */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int n_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) {
+ close(n_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(n_fd);
+
+ /* Create UTS namespace (also owned by user namespace) */
+ if (unshare(CLONE_NEWUTS) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ut_fd = open("/proc/self/ns/uts", O_RDONLY);
+ if (ut_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) {
+ close(ut_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ut_fd);
+
+ /* Send all namespace IDs */
+ write(pipefd[1], &u_id, sizeof(u_id));
+ write(pipefd[1], &n_id, sizeof(n_id));
+ write(pipefd[1], &ut_id, sizeof(ut_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &u_id, sizeof(u_id));
+ if (ret != sizeof(u_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID");
+ }
+
+ ret = read(pipefd[0], &n_id, sizeof(n_id));
+ if (ret != sizeof(n_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ ret = read(pipefd[0], &ut_id, sizeof(ut_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ut_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read UTS namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ u_handle = (struct file_handle *)u_buf;
+ u_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ u_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)u_handle->f_handle;
+ u_fh->ns_id = u_id;
+ u_fh->ns_type = 0;
+ u_fh->ns_inum = 0;
+
+ n_handle = (struct file_handle *)n_buf;
+ n_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ n_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)n_handle->f_handle;
+ n_fh->ns_id = n_id;
+ n_fh->ns_type = 0;
+ n_fh->ns_inum = 0;
+
+ ut_handle = (struct file_handle *)ut_buf;
+ ut_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ut_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)ut_handle->f_handle;
+ ut_fh->ns_id = ut_id;
+ ut_fh->ns_type = 0;
+ ut_fh->ns_inum = 0;
+
+ /* Open both non-user namespaces before process exits */
+ int n_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY);
+ int ut_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY);
+
+ if (n_fd < 0 || ut_fd < 0) {
+ if (n_fd >= 0) close(n_fd);
+ if (ut_fd >= 0) close(ut_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open namespaces");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /*
+ * Both network and UTS namespaces are active.
+ * User namespace should be active (gets 2 active refs).
+ */
+ TH_LOG("Both net and uts active - user namespace should be active");
+ int u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close network namespace - user namespace should STILL be active */
+ TH_LOG("Closing network ns - user ns should still be active (uts still active)");
+ close(n_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close UTS namespace - user namespace should become inactive */
+ TH_LOG("Closing uts ns - user ns should become inactive");
+ close(ut_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+ ASSERT_LT(u_fd, 0);
+}
+
+/*
+ * Test hierarchical propagation with deep namespace hierarchy.
+ * Create: init_user_ns -> user_A -> user_B -> net_ns
+ * When net_ns is active, both user_A and user_B should be active.
+ * This verifies the conditional recursion in __ns_ref_active_put() works.
+ */
+TEST(ns_deep_hierarchy_propagation)
+{
+ struct file_handle *ua_handle, *ub_handle, *net_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 ua_id, ub_id, net_id;
+ char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ];
+ char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ];
+ char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create user_A -> user_B -> net hierarchy */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ua_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ua_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) {
+ close(ua_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ua_fd);
+
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ub_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ub_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) {
+ close(ub_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ub_fd);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int net_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (net_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) {
+ close(net_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(net_fd);
+
+ /* Send all three namespace IDs */
+ write(pipefd[1], &ua_id, sizeof(ua_id));
+ write(pipefd[1], &ub_id, sizeof(ub_id));
+ write(pipefd[1], &net_id, sizeof(net_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &ua_id, sizeof(ua_id));
+ if (ret != sizeof(ua_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_A namespace ID");
+ }
+
+ ret = read(pipefd[0], &ub_id, sizeof(ub_id));
+ if (ret != sizeof(ub_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_B namespace ID");
+ }
+
+ ret = read(pipefd[0], &net_id, sizeof(net_id));
+ close(pipefd[0]);
+ if (ret != sizeof(net_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ ua_handle = (struct file_handle *)ua_buf;
+ ua_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ua_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle;
+ ua_fh->ns_id = ua_id;
+ ua_fh->ns_type = 0;
+ ua_fh->ns_inum = 0;
+
+ ub_handle = (struct file_handle *)ub_buf;
+ ub_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ub_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle;
+ ub_fh->ns_id = ub_id;
+ ub_fh->ns_type = 0;
+ ub_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)net_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ net_fh->ns_id = net_id;
+ net_fh->ns_type = 0;
+ net_fh->ns_inum = 0;
+
+ /* Open net_ns before child exits to keep it active */
+ int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ if (net_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open network namespace");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* With net_ns active, both user_A and user_B should be active */
+ TH_LOG("Testing user_B active (net_ns active causes propagation)");
+ int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ ASSERT_GE(ub_fd, 0);
+
+ TH_LOG("Testing user_A active (propagated through user_B)");
+ int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd, 0);
+
+ /* Close net_ns - user_B should stay active (we hold direct ref) */
+ TH_LOG("Closing net_ns, user_B should remain active (direct ref held)");
+ close(net_fd);
+ int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ ASSERT_GE(ub_fd2, 0);
+ close(ub_fd2);
+
+ /* Close user_B - user_A should stay active (we hold direct ref) */
+ TH_LOG("Closing user_B, user_A should remain active (direct ref held)");
+ close(ub_fd);
+ int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd2, 0);
+ close(ua_fd2);
+
+ /* Close user_A - everything should become inactive */
+ TH_LOG("Closing user_A, all should become inactive");
+ close(ua_fd);
+
+ /* All should now be inactive */
+ ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_LT(ua_fd, 0);
+}
+
+/*
+ * Test that parent stays active as long as ANY child is active.
+ * Create parent user namespace with two child net namespaces.
+ * Parent should remain active until BOTH children are inactive.
+ */
+TEST(ns_parent_multiple_children_refcount)
+{
+ struct file_handle *parent_handle, *net1_handle, *net2_handle;
+ int ret, pipefd[2], syncpipe[2];
+ pid_t pid;
+ int status;
+ __u64 p_id, n1_id, n2_id;
+ char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+ char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ];
+ char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(pipe(syncpipe), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+ close(syncpipe[1]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int p_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (p_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+ close(p_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(p_fd);
+
+ /* Create first network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ int n1_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n1_fd < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+ if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) {
+ close(n1_fd);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+ /* Keep n1_fd open so first namespace stays active */
+
+ /* Create second network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(n1_fd);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ int n2_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n2_fd < 0) {
+ close(n1_fd);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+ if (ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) {
+ close(n1_fd);
+ close(n2_fd);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+ /* Keep both n1_fd and n2_fd open */
+
+ /* Send all namespace IDs */
+ write(pipefd[1], &p_id, sizeof(p_id));
+ write(pipefd[1], &n1_id, sizeof(n1_id));
+ write(pipefd[1], &n2_id, sizeof(n2_id));
+ close(pipefd[1]);
+
+ /* Wait for parent to signal before exiting */
+ read(syncpipe[0], &sync_byte, 1);
+ close(syncpipe[0]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+ close(syncpipe[0]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &p_id, sizeof(p_id));
+ if (ret != sizeof(p_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID");
+ }
+
+ ret = read(pipefd[0], &n1_id, sizeof(n1_id));
+ if (ret != sizeof(n1_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read first network namespace ID");
+ }
+
+ ret = read(pipefd[0], &n2_id, sizeof(n2_id));
+ close(pipefd[0]);
+ if (ret != sizeof(n2_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read second network namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ parent_handle = (struct file_handle *)p_buf;
+ parent_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ parent_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle;
+ p_fh->ns_id = p_id;
+ p_fh->ns_type = 0;
+ p_fh->ns_inum = 0;
+
+ net1_handle = (struct file_handle *)n1_buf;
+ net1_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net1_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n1_fh = (struct nsfs_file_handle *)net1_handle->f_handle;
+ n1_fh->ns_id = n1_id;
+ n1_fh->ns_type = 0;
+ n1_fh->ns_inum = 0;
+
+ net2_handle = (struct file_handle *)n2_buf;
+ net2_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net2_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle;
+ n2_fh->ns_id = n2_id;
+ n2_fh->ns_type = 0;
+ n2_fh->ns_inum = 0;
+
+ /* Open both net namespaces while child is still alive */
+ int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY);
+ int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY);
+ if (n1_fd < 0 || n2_fd < 0) {
+ if (n1_fd >= 0) close(n1_fd);
+ if (n2_fd >= 0) close(n2_fd);
+ sync_byte = 'G';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open net namespaces");
+ }
+
+ /* Signal child that we have opened the namespaces */
+ sync_byte = 'G';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Parent should be active (has 2 active children) */
+ TH_LOG("Both net namespaces active - parent should be active");
+ int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close first net namespace - parent should STILL be active */
+ TH_LOG("Closing first net ns - parent should still be active");
+ close(n1_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close second net namespace - parent should become inactive */
+ TH_LOG("Closing second net ns - parent should become inactive");
+ close(n2_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_LT(p_fd, 0);
+}
+
+/*
+ * Test that user namespace as a child also propagates correctly.
+ * Create user_A -> user_B, verify when user_B is active that user_A
+ * is also active. This is different from non-user namespace children.
+ */
+TEST(ns_userns_child_propagation)
+{
+ struct file_handle *ua_handle, *ub_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 ua_id, ub_id;
+ char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ];
+ char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create user_A */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ua_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ua_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) {
+ close(ua_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ua_fd);
+
+ /* Create user_B (child of user_A) */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ub_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ub_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) {
+ close(ub_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ub_fd);
+
+ /* Send both namespace IDs */
+ write(pipefd[1], &ua_id, sizeof(ua_id));
+ write(pipefd[1], &ub_id, sizeof(ub_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read both namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &ua_id, sizeof(ua_id));
+ if (ret != sizeof(ua_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_A namespace ID");
+ }
+
+ ret = read(pipefd[0], &ub_id, sizeof(ub_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ub_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_B namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ ua_handle = (struct file_handle *)ua_buf;
+ ua_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ua_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle;
+ ua_fh->ns_id = ua_id;
+ ua_fh->ns_type = 0;
+ ua_fh->ns_inum = 0;
+
+ ub_handle = (struct file_handle *)ub_buf;
+ ub_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ub_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle;
+ ub_fh->ns_id = ub_id;
+ ub_fh->ns_type = 0;
+ ub_fh->ns_inum = 0;
+
+ /* Open user_B before child exits */
+ int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ if (ub_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open user_B");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* With user_B active, user_A should also be active */
+ TH_LOG("Testing user_A active when child user_B is active");
+ int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd, 0);
+
+ /* Close user_B */
+ TH_LOG("Closing user_B");
+ close(ub_fd);
+
+ /* user_A should remain active (we hold direct ref) */
+ int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd2, 0);
+ close(ua_fd2);
+
+ /* Close user_A - should become inactive */
+ TH_LOG("Closing user_A - should become inactive");
+ close(ua_fd);
+
+ ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_LT(ua_fd, 0);
+}
+
+/*
+ * Test different namespace types (net, uts, ipc) all contributing
+ * active references to the same owning user namespace.
+ */
+TEST(ns_mixed_types_same_owner)
+{
+ struct file_handle *user_handle, *net_handle, *uts_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 u_id, n_id, ut_id;
+ char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ];
+ char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+ char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int u_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (u_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) {
+ close(u_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(u_fd);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int n_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) {
+ close(n_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(n_fd);
+
+ if (unshare(CLONE_NEWUTS) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ut_fd = open("/proc/self/ns/uts", O_RDONLY);
+ if (ut_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) {
+ close(ut_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ut_fd);
+
+ /* Send all namespace IDs */
+ write(pipefd[1], &u_id, sizeof(u_id));
+ write(pipefd[1], &n_id, sizeof(n_id));
+ write(pipefd[1], &ut_id, sizeof(ut_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &u_id, sizeof(u_id));
+ if (ret != sizeof(u_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID");
+ }
+
+ ret = read(pipefd[0], &n_id, sizeof(n_id));
+ if (ret != sizeof(n_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ ret = read(pipefd[0], &ut_id, sizeof(ut_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ut_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read UTS namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ user_handle = (struct file_handle *)u_buf;
+ user_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ user_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle;
+ u_fh->ns_id = u_id;
+ u_fh->ns_type = 0;
+ u_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)n_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ n_fh->ns_id = n_id;
+ n_fh->ns_type = 0;
+ n_fh->ns_inum = 0;
+
+ uts_handle = (struct file_handle *)ut_buf;
+ uts_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ uts_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)uts_handle->f_handle;
+ ut_fh->ns_id = ut_id;
+ ut_fh->ns_type = 0;
+ ut_fh->ns_inum = 0;
+
+ /* Open both non-user namespaces */
+ int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY);
+ if (n_fd < 0 || ut_fd < 0) {
+ if (n_fd >= 0) close(n_fd);
+ if (ut_fd >= 0) close(ut_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open namespaces");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* User namespace should be active (2 active children) */
+ TH_LOG("Both net and uts active - user ns should be active");
+ int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close net - user ns should STILL be active (uts still active) */
+ TH_LOG("Closing net - user ns should still be active");
+ close(n_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close uts - user ns should become inactive */
+ TH_LOG("Closing uts - user ns should become inactive");
+ close(ut_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_LT(u_fd, 0);
+}
+
+/* Thread test helpers and structures */
+struct thread_ns_info {
+ __u64 ns_id;
+ int pipefd;
+ int syncfd_read;
+ int syncfd_write;
+ int exit_code;
+};
+
+static void *thread_create_namespace(void *arg)
+{
+ struct thread_ns_info *info = (struct thread_ns_info *)arg;
+ int ret;
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ info->exit_code = 1;
+ return NULL;
+ }
+
+ /* Get namespace ID */
+ int fd = open("/proc/thread-self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ info->exit_code = 2;
+ return NULL;
+ }
+
+ ret = ioctl(fd, NS_GET_ID, &info->ns_id);
+ close(fd);
+ if (ret < 0) {
+ info->exit_code = 3;
+ return NULL;
+ }
+
+ /* Send namespace ID to main thread */
+ if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) {
+ info->exit_code = 4;
+ return NULL;
+ }
+
+ /* Wait for signal to exit */
+ char sync_byte;
+ if (read(info->syncfd_read, &sync_byte, 1) != 1) {
+ info->exit_code = 5;
+ return NULL;
+ }
+
+ info->exit_code = 0;
+ return NULL;
+}
+
+/*
+ * Test that namespace becomes inactive after thread exits.
+ * This verifies active reference counting works with threads, not just processes.
+ */
+TEST(thread_ns_inactive_after_exit)
+{
+ pthread_t thread;
+ struct thread_ns_info info;
+ struct file_handle *handle;
+ int pipefd[2];
+ int syncpipe[2];
+ int ret;
+ char sync_byte;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ info.pipefd = pipefd[1];
+ info.syncfd_read = syncpipe[0];
+ info.syncfd_write = -1;
+ info.exit_code = -1;
+
+ /* Create thread that will create a namespace */
+ ret = pthread_create(&thread, NULL, thread_create_namespace, &info);
+ ASSERT_EQ(ret, 0);
+
+ /* Read namespace ID from thread */
+ __u64 ns_id;
+ ret = read(pipefd[0], &ns_id, sizeof(ns_id));
+ if (ret != sizeof(ns_id)) {
+ sync_byte = 'X';
+ write(syncpipe[1], &sync_byte, 1);
+ pthread_join(thread, NULL);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ close(syncpipe[1]);
+ SKIP(return, "Failed to read namespace ID from thread");
+ }
+
+ TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id);
+
+ /* Construct file handle */
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle;
+ fh->ns_id = ns_id;
+ fh->ns_type = 0;
+ fh->ns_inum = 0;
+
+ /* Namespace should be active while thread is alive */
+ TH_LOG("Attempting to open namespace while thread is alive (should succeed)");
+ int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(nsfd, 0);
+ close(nsfd);
+
+ /* Signal thread to exit */
+ TH_LOG("Signaling thread to exit");
+ sync_byte = 'X';
+ ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1);
+ close(syncpipe[1]);
+
+ /* Wait for thread to exit */
+ ASSERT_EQ(pthread_join(thread, NULL), 0);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+
+ if (info.exit_code != 0)
+ SKIP(return, "Thread failed to create namespace");
+
+ TH_LOG("Thread exited, namespace should be inactive");
+
+ /* Namespace should now be inactive */
+ nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(nsfd, 0);
+ /* Should fail with ENOENT (inactive) or ESTALE (gone) */
+ TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that a namespace remains active while a thread holds an fd to it.
+ * Even after the thread exits, the namespace should remain active as long as
+ * another thread holds a file descriptor to it.
+ */
+TEST(thread_ns_fd_keeps_active)
+{
+ pthread_t thread;
+ struct thread_ns_info info;
+ struct file_handle *handle;
+ int pipefd[2];
+ int syncpipe[2];
+ int ret;
+ char sync_byte;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ info.pipefd = pipefd[1];
+ info.syncfd_read = syncpipe[0];
+ info.syncfd_write = -1;
+ info.exit_code = -1;
+
+ /* Create thread that will create a namespace */
+ ret = pthread_create(&thread, NULL, thread_create_namespace, &info);
+ ASSERT_EQ(ret, 0);
+
+ /* Read namespace ID from thread */
+ __u64 ns_id;
+ ret = read(pipefd[0], &ns_id, sizeof(ns_id));
+ if (ret != sizeof(ns_id)) {
+ sync_byte = 'X';
+ write(syncpipe[1], &sync_byte, 1);
+ pthread_join(thread, NULL);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ close(syncpipe[1]);
+ SKIP(return, "Failed to read namespace ID from thread");
+ }
+
+ TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id);
+
+ /* Construct file handle */
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle;
+ fh->ns_id = ns_id;
+ fh->ns_type = 0;
+ fh->ns_inum = 0;
+
+ /* Open namespace while thread is alive */
+ TH_LOG("Opening namespace while thread is alive");
+ int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(nsfd, 0);
+
+ /* Signal thread to exit */
+ TH_LOG("Signaling thread to exit");
+ sync_byte = 'X';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ /* Wait for thread to exit */
+ pthread_join(thread, NULL);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+
+ if (info.exit_code != 0) {
+ close(nsfd);
+ SKIP(return, "Thread failed to create namespace");
+ }
+
+ TH_LOG("Thread exited, but main thread holds fd - namespace should remain active");
+
+ /* Namespace should still be active because we hold an fd */
+ int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(nsfd2, 0);
+
+ /* Verify it's the same namespace */
+ struct stat st1, st2;
+ ASSERT_EQ(fstat(nsfd, &st1), 0);
+ ASSERT_EQ(fstat(nsfd2, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ close(nsfd2);
+
+ TH_LOG("Closing fd - namespace should become inactive");
+ close(nsfd);
+
+ /* Now namespace should be inactive */
+ nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(nsfd, 0);
+ /* Should fail with ENOENT (inactive) or ESTALE (gone) */
+ TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/* Structure for thread data in subprocess */
+struct thread_sleep_data {
+ int syncfd_read;
+};
+
+static void *thread_sleep_and_wait(void *arg)
+{
+ struct thread_sleep_data *data = (struct thread_sleep_data *)arg;
+ char sync_byte;
+
+ /* Wait for signal to exit - read will unblock when pipe is closed */
+ (void)read(data->syncfd_read, &sync_byte, 1);
+ return NULL;
+}
+
+/*
+ * Test that namespaces become inactive after subprocess with multiple threads exits.
+ * Create a subprocess that unshares user and network namespaces, then creates two
+ * threads that share those namespaces. Verify that after all threads and subprocess
+ * exit, the namespaces are no longer listed by listns() and cannot be opened by
+ * open_by_handle_at().
+ */
+TEST(thread_subprocess_ns_inactive_after_all_exit)
+{
+ int pipefd[2];
+ int sv[2];
+ pid_t pid;
+ int status;
+ __u64 user_id, net_id;
+ struct file_handle *user_handle, *net_handle;
+ char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ];
+ char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+ int ret;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+ close(sv[0]);
+
+ /* Create user namespace with mappings */
+ if (setup_userns() < 0) {
+ fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ fprintf(stderr, "Child: setup_userns() succeeded\n");
+
+ /* Get user namespace ID */
+ int user_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (user_fd < 0) {
+ fprintf(stderr, "Child: open(/proc/self/ns/user) failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(user_fd, NS_GET_ID, &user_id) < 0) {
+ fprintf(stderr, "Child: ioctl(NS_GET_ID) for user ns failed: %s\n", strerror(errno));
+ close(user_fd);
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ close(user_fd);
+ fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id);
+
+ /* Unshare network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n");
+
+ /* Get network namespace ID */
+ int net_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (net_fd < 0) {
+ fprintf(stderr, "Child: open(/proc/self/ns/net) failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) {
+ fprintf(stderr, "Child: ioctl(NS_GET_ID) for net ns failed: %s\n", strerror(errno));
+ close(net_fd);
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ close(net_fd);
+ fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id);
+
+ /* Send namespace IDs to parent */
+ if (write(pipefd[1], &user_id, sizeof(user_id)) != sizeof(user_id)) {
+ fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno));
+ exit(1);
+ }
+ if (write(pipefd[1], &net_id, sizeof(net_id)) != sizeof(net_id)) {
+ fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno));
+ exit(1);
+ }
+ close(pipefd[1]);
+ fprintf(stderr, "Child: sent namespace IDs to parent\n");
+
+ /* Create two threads that share the namespaces */
+ pthread_t thread1, thread2;
+ struct thread_sleep_data data;
+ data.syncfd_read = sv[1];
+
+ int ret_thread = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data);
+ if (ret_thread != 0) {
+ fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(ret_thread));
+ close(sv[1]);
+ exit(1);
+ }
+ fprintf(stderr, "Child: created thread1\n");
+
+ ret_thread = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data);
+ if (ret_thread != 0) {
+ fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(ret_thread));
+ close(sv[1]);
+ pthread_cancel(thread1);
+ exit(1);
+ }
+ fprintf(stderr, "Child: created thread2\n");
+
+ /* Wait for threads to complete - they will unblock when parent writes */
+ fprintf(stderr, "Child: waiting for threads to exit\n");
+ pthread_join(thread1, NULL);
+ fprintf(stderr, "Child: thread1 exited\n");
+ pthread_join(thread2, NULL);
+ fprintf(stderr, "Child: thread2 exited\n");
+
+ close(sv[1]);
+
+ /* Exit - namespaces should become inactive */
+ fprintf(stderr, "Child: all threads joined, exiting with success\n");
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ close(sv[1]);
+
+ TH_LOG("Parent: waiting to read namespace IDs from child");
+
+ /* Read namespace IDs from child */
+ ret = read(pipefd[0], &user_id, sizeof(user_id));
+ if (ret != sizeof(user_id)) {
+ TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno));
+ close(pipefd[0]);
+ sync_byte = 'X';
+ (void)write(sv[0], &sync_byte, 1);
+ close(sv[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID from child");
+ }
+
+ ret = read(pipefd[0], &net_id, sizeof(net_id));
+ close(pipefd[0]);
+ if (ret != sizeof(net_id)) {
+ TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno));
+ sync_byte = 'X';
+ (void)write(sv[0], &sync_byte, 1);
+ close(sv[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID from child");
+ }
+
+ TH_LOG("Child created user ns %llu and net ns %llu with 2 threads",
+ (unsigned long long)user_id, (unsigned long long)net_id);
+
+ /* Construct file handles */
+ user_handle = (struct file_handle *)user_buf;
+ user_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ user_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *user_fh = (struct nsfs_file_handle *)user_handle->f_handle;
+ user_fh->ns_id = user_id;
+ user_fh->ns_type = 0;
+ user_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)net_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ net_fh->ns_id = net_id;
+ net_fh->ns_type = 0;
+ net_fh->ns_inum = 0;
+
+ /* Verify namespaces are active while subprocess and threads are alive */
+ TH_LOG("Verifying namespaces are active while subprocess with threads is running");
+ int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(user_fd, 0);
+
+ int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ ASSERT_GE(net_fd, 0);
+
+ close(user_fd);
+ close(net_fd);
+
+ /* Also verify they appear in listns() */
+ TH_LOG("Verifying namespaces appear in listns() while active");
+ struct ns_id_req req = {
+ .size = sizeof(struct ns_id_req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ int nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids < 0) {
+ TH_LOG("listns() not available, skipping listns verification");
+ } else {
+ /* Check if user_id is in the list */
+ int found_user = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == user_id) {
+ found_user = 1;
+ break;
+ }
+ }
+ ASSERT_TRUE(found_user);
+ TH_LOG("User namespace found in listns() as expected");
+
+ /* Check network namespace */
+ req.ns_type = CLONE_NEWNET;
+ nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids >= 0) {
+ int found_net = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == net_id) {
+ found_net = 1;
+ break;
+ }
+ }
+ ASSERT_TRUE(found_net);
+ TH_LOG("Network namespace found in listns() as expected");
+ }
+ }
+
+ /* Signal threads to exit */
+ TH_LOG("Signaling threads to exit");
+ sync_byte = 'X';
+ /* Write two bytes - one for each thread */
+ ASSERT_EQ(write(sv[0], &sync_byte, 1), 1);
+ ASSERT_EQ(write(sv[0], &sync_byte, 1), 1);
+ close(sv[0]);
+
+ /* Wait for child process to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ if (WEXITSTATUS(status) != 0) {
+ TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status));
+ SKIP(return, "Child process failed");
+ }
+
+ TH_LOG("Subprocess and all threads have exited successfully");
+
+ /* Verify namespaces are now inactive - open_by_handle_at should fail */
+ TH_LOG("Verifying namespaces are inactive after subprocess and threads exit");
+ user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_LT(user_fd, 0);
+ TH_LOG("User namespace inactive as expected: %s (errno=%d)",
+ strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+
+ net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ ASSERT_LT(net_fd, 0);
+ TH_LOG("Network namespace inactive as expected: %s (errno=%d)",
+ strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+
+ /* Verify namespaces do NOT appear in listns() */
+ TH_LOG("Verifying namespaces do NOT appear in listns() when inactive");
+ memset(&req, 0, sizeof(req));
+ req.size = sizeof(struct ns_id_req);
+ req.ns_type = CLONE_NEWUSER;
+ nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids >= 0) {
+ int found_user = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == user_id) {
+ found_user = 1;
+ break;
+ }
+ }
+ ASSERT_FALSE(found_user);
+ TH_LOG("User namespace correctly not listed in listns()");
+
+ /* Check network namespace */
+ req.ns_type = CLONE_NEWNET;
+ nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids >= 0) {
+ int found_net = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == net_id) {
+ found_net = 1;
+ break;
+ }
+ }
+ ASSERT_FALSE(found_net);
+ TH_LOG("Network namespace correctly not listed in listns()");
+ }
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c
index e28accd74a57..527ade0a8673 100644
--- a/tools/testing/selftests/namespaces/nsid_test.c
+++ b/tools/testing/selftests/namespaces/nsid_test.c
@@ -6,6 +6,7 @@
#include <libgen.h>
#include <limits.h>
#include <pthread.h>
+#include <signal.h>
#include <string.h>
#include <sys/mount.h>
#include <poll.h>
@@ -14,12 +15,30 @@
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
+#include <sys/wait.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/limits.h>
#include <linux/nsfs.h>
#include "../kselftest_harness.h"
+/* Fixture for tests that create child processes */
+FIXTURE(nsid) {
+ pid_t child_pid;
+};
+
+FIXTURE_SETUP(nsid) {
+ self->child_pid = 0;
+}
+
+FIXTURE_TEARDOWN(nsid) {
+ /* Clean up any child process that may still be running */
+ if (self->child_pid > 0) {
+ kill(self->child_pid, SIGKILL);
+ waitpid(self->child_pid, NULL, 0);
+ }
+}
+
TEST(nsid_mntns_basic)
{
__u64 mnt_ns_id = 0;
@@ -44,7 +63,7 @@ TEST(nsid_mntns_basic)
close(fd_mntns);
}
-TEST(nsid_mntns_separate)
+TEST_F(nsid, mntns_separate)
{
__u64 parent_mnt_ns_id = 0;
__u64 child_mnt_ns_id = 0;
@@ -90,6 +109,9 @@ TEST(nsid_mntns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -99,8 +121,6 @@ TEST(nsid_mntns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_mntns);
SKIP(return, "No permission to create mount namespace");
}
@@ -123,10 +143,6 @@ TEST(nsid_mntns_separate)
close(fd_parent_mntns);
close(fd_child_mntns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_cgroupns_basic)
@@ -153,7 +169,7 @@ TEST(nsid_cgroupns_basic)
close(fd_cgroupns);
}
-TEST(nsid_cgroupns_separate)
+TEST_F(nsid, cgroupns_separate)
{
__u64 parent_cgroup_ns_id = 0;
__u64 child_cgroup_ns_id = 0;
@@ -199,6 +215,9 @@ TEST(nsid_cgroupns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -208,8 +227,6 @@ TEST(nsid_cgroupns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_cgroupns);
SKIP(return, "No permission to create cgroup namespace");
}
@@ -232,10 +249,6 @@ TEST(nsid_cgroupns_separate)
close(fd_parent_cgroupns);
close(fd_child_cgroupns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_ipcns_basic)
@@ -262,7 +275,7 @@ TEST(nsid_ipcns_basic)
close(fd_ipcns);
}
-TEST(nsid_ipcns_separate)
+TEST_F(nsid, ipcns_separate)
{
__u64 parent_ipc_ns_id = 0;
__u64 child_ipc_ns_id = 0;
@@ -308,6 +321,9 @@ TEST(nsid_ipcns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -317,8 +333,6 @@ TEST(nsid_ipcns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_ipcns);
SKIP(return, "No permission to create IPC namespace");
}
@@ -341,10 +355,6 @@ TEST(nsid_ipcns_separate)
close(fd_parent_ipcns);
close(fd_child_ipcns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_utsns_basic)
@@ -371,7 +381,7 @@ TEST(nsid_utsns_basic)
close(fd_utsns);
}
-TEST(nsid_utsns_separate)
+TEST_F(nsid, utsns_separate)
{
__u64 parent_uts_ns_id = 0;
__u64 child_uts_ns_id = 0;
@@ -417,6 +427,9 @@ TEST(nsid_utsns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -426,8 +439,6 @@ TEST(nsid_utsns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_utsns);
SKIP(return, "No permission to create UTS namespace");
}
@@ -450,10 +461,6 @@ TEST(nsid_utsns_separate)
close(fd_parent_utsns);
close(fd_child_utsns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_userns_basic)
@@ -480,7 +487,7 @@ TEST(nsid_userns_basic)
close(fd_userns);
}
-TEST(nsid_userns_separate)
+TEST_F(nsid, userns_separate)
{
__u64 parent_user_ns_id = 0;
__u64 child_user_ns_id = 0;
@@ -526,6 +533,9 @@ TEST(nsid_userns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -535,8 +545,6 @@ TEST(nsid_userns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_userns);
SKIP(return, "No permission to create user namespace");
}
@@ -559,10 +567,6 @@ TEST(nsid_userns_separate)
close(fd_parent_userns);
close(fd_child_userns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_timens_basic)
@@ -591,7 +595,7 @@ TEST(nsid_timens_basic)
close(fd_timens);
}
-TEST(nsid_timens_separate)
+TEST_F(nsid, timens_separate)
{
__u64 parent_time_ns_id = 0;
__u64 child_time_ns_id = 0;
@@ -652,6 +656,9 @@ TEST(nsid_timens_separate)
}
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -660,8 +667,6 @@ TEST(nsid_timens_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_timens);
close(pipefd[0]);
SKIP(return, "Cannot create time namespace");
@@ -689,10 +694,6 @@ TEST(nsid_timens_separate)
close(fd_parent_timens);
close(fd_child_timens);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_pidns_basic)
@@ -719,7 +720,7 @@ TEST(nsid_pidns_basic)
close(fd_pidns);
}
-TEST(nsid_pidns_separate)
+TEST_F(nsid, pidns_separate)
{
__u64 parent_pid_ns_id = 0;
__u64 child_pid_ns_id = 0;
@@ -776,6 +777,9 @@ TEST(nsid_pidns_separate)
}
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -784,8 +788,6 @@ TEST(nsid_pidns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_pidns);
close(pipefd[0]);
SKIP(return, "No permission to create PID namespace");
@@ -813,10 +815,6 @@ TEST(nsid_pidns_separate)
close(fd_parent_pidns);
close(fd_child_pidns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_netns_basic)
@@ -860,7 +858,7 @@ TEST(nsid_netns_basic)
close(fd_netns);
}
-TEST(nsid_netns_separate)
+TEST_F(nsid, netns_separate)
{
__u64 parent_net_ns_id = 0;
__u64 parent_netns_cookie = 0;
@@ -920,6 +918,9 @@ TEST(nsid_netns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -929,8 +930,6 @@ TEST(nsid_netns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_netns);
close(parent_sock);
SKIP(return, "No permission to create network namespace");
@@ -977,10 +976,6 @@ TEST(nsid_netns_separate)
close(fd_parent_netns);
close(fd_child_netns);
close(parent_sock);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
new file mode 100644
index 000000000000..753fd29dffd8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "../pidfd/pidfd.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Regression tests for the setns(pidfd) active reference counting bug.
+ *
+ * These tests are based on the reproducers that triggered the race condition
+ * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly").
+ *
+ * The bug: When using setns() with a pidfd, if the target task exits between
+ * prepare_nsset() and commit_nsset(), the namespaces would become inactive.
+ * Then ns_ref_active_get() would increment from 0 without properly resurrecting
+ * the owner chain, causing active reference count underflows.
+ */
+
+/*
+ * Simple pidfd setns test using create_child()+unshare().
+ *
+ * Without the fix, this would trigger active refcount warnings when the
+ * parent exits after doing setns(pidfd) on a child that has already exited.
+ */
+TEST(simple_pidfd_setns)
+{
+ pid_t child_pid;
+ int pidfd = -1;
+ int ret;
+ int sv[2];
+ char c;
+
+ /* Ignore SIGCHLD for autoreap */
+ ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create a child process without namespaces initially */
+ child_pid = create_child(&pidfd, 0);
+ ASSERT_GE(child_pid, 0);
+
+ if (child_pid == 0) {
+ close(sv[0]);
+
+ if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) {
+ close(sv[1]);
+ _exit(1);
+ }
+
+ /* Signal parent that namespaces are ready */
+ if (write_nointr(sv[1], "1", 1) < 0) {
+ close(sv[1]);
+ _exit(1);
+ }
+
+ close(sv[1]);
+ _exit(0);
+ }
+ ASSERT_GE(pidfd, 0);
+ EXPECT_EQ(close(sv[1]), 0);
+
+ ret = read_nointr(sv[0], &c, 1);
+ ASSERT_EQ(ret, 1);
+ EXPECT_EQ(close(sv[0]), 0);
+
+ /* Set to child's namespaces via pidfd */
+ ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+ TH_LOG("setns() returned %d", ret);
+ close(pidfd);
+}
+
+/*
+ * Simple pidfd setns test using create_child().
+ *
+ * This variation uses create_child() with namespace flags directly.
+ * Namespaces are created immediately at clone time.
+ */
+TEST(simple_pidfd_setns_clone)
+{
+ pid_t child_pid;
+ int pidfd = -1;
+ int ret;
+
+ /* Ignore SIGCHLD for autoreap */
+ ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+ /* Create a child process with new namespaces using create_child() */
+ child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET);
+ ASSERT_GE(child_pid, 0);
+
+ if (child_pid == 0) {
+ /* Child: sleep for a while so parent can setns to us */
+ sleep(2);
+ _exit(0);
+ }
+
+ /* Parent: pidfd was already created by create_child() */
+ ASSERT_GE(pidfd, 0);
+
+ /* Set to child's namespaces via pidfd */
+ ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+ close(pidfd);
+ TH_LOG("setns() returned %d", ret);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/siocgskns_test.c b/tools/testing/selftests/namespaces/siocgskns_test.c
new file mode 100644
index 000000000000..ba689a22d82f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/siocgskns_test.c
@@ -0,0 +1,1824 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/if.h>
+#include <linux/sockios.h>
+#include <linux/nsfs.h>
+#include <arpa/inet.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+#ifndef SIOCGSKNS
+#define SIOCGSKNS 0x894C
+#endif
+
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003
+#endif
+
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
+
+/*
+ * Test basic SIOCGSKNS functionality.
+ * Create a socket and verify SIOCGSKNS returns the correct network namespace.
+ */
+TEST(siocgskns_basic)
+{
+ int sock_fd, netns_fd, current_netns_fd;
+ struct stat st1, st2;
+
+ /* Create a TCP socket */
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(sock_fd, 0);
+
+ /* Use SIOCGSKNS to get network namespace */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Get current network namespace */
+ current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(current_netns_fd, 0);
+
+ /* Verify they match */
+ ASSERT_EQ(fstat(netns_fd, &st1), 0);
+ ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ close(sock_fd);
+ close(netns_fd);
+ close(current_netns_fd);
+}
+
+/*
+ * Test that socket file descriptors keep network namespaces active.
+ * Create a network namespace, create a socket in it, then exit the namespace.
+ * The namespace should remain active while the socket FD is held.
+ */
+TEST(siocgskns_keeps_netns_active)
+{
+ int sock_fd, netns_fd, test_fd;
+ int ipc_sockets[2];
+ pid_t pid;
+ int status;
+ struct stat st;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new netns and socket */
+ close(ipc_sockets[0]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Create a socket in the new network namespace */
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ TH_LOG("socket() failed: %s", strerror(errno));
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ ASSERT_EQ(fstat(netns_fd, &st), 0);
+
+ /*
+ * Namespace should still be active because socket FD keeps it alive.
+ * Try to access it via /proc/self/fd/<fd>.
+ */
+ char path[64];
+ snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd);
+ test_fd = open(path, O_RDONLY);
+ ASSERT_GE(test_fd, 0);
+ close(test_fd);
+ close(netns_fd);
+
+ /* Close socket - namespace should become inactive */
+ close(sock_fd);
+
+ /* Try SIOCGSKNS again - should fail since socket is closed */
+ ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0);
+}
+
+/*
+ * Test SIOCGSKNS with different socket types (TCP, UDP, RAW).
+ */
+TEST(siocgskns_socket_types)
+{
+ int sock_tcp, sock_udp, sock_raw;
+ int netns_tcp, netns_udp, netns_raw;
+ struct stat st_tcp, st_udp, st_raw;
+
+ /* TCP socket */
+ sock_tcp = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(sock_tcp, 0);
+
+ /* UDP socket */
+ sock_udp = socket(AF_INET, SOCK_DGRAM, 0);
+ ASSERT_GE(sock_udp, 0);
+
+ /* RAW socket (may require privileges) */
+ sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
+ if (sock_raw < 0 && (errno == EPERM || errno == EACCES)) {
+ sock_raw = -1; /* Skip raw socket test */
+ }
+
+ /* Test SIOCGSKNS on TCP */
+ netns_tcp = ioctl(sock_tcp, SIOCGSKNS);
+ if (netns_tcp < 0) {
+ close(sock_tcp);
+ close(sock_udp);
+ if (sock_raw >= 0) close(sock_raw);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_tcp, 0);
+ }
+
+ /* Test SIOCGSKNS on UDP */
+ netns_udp = ioctl(sock_udp, SIOCGSKNS);
+ ASSERT_GE(netns_udp, 0);
+
+ /* Test SIOCGSKNS on RAW (if available) */
+ if (sock_raw >= 0) {
+ netns_raw = ioctl(sock_raw, SIOCGSKNS);
+ ASSERT_GE(netns_raw, 0);
+ }
+
+ /* Verify all return the same network namespace */
+ ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0);
+ ASSERT_EQ(fstat(netns_udp, &st_udp), 0);
+ ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino);
+
+ if (sock_raw >= 0) {
+ ASSERT_EQ(fstat(netns_raw, &st_raw), 0);
+ ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino);
+ close(netns_raw);
+ close(sock_raw);
+ }
+
+ close(netns_tcp);
+ close(netns_udp);
+ close(sock_tcp);
+ close(sock_udp);
+}
+
+/*
+ * Test SIOCGSKNS across setns.
+ * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still
+ * returns netns A.
+ */
+TEST(siocgskns_across_setns)
+{
+ int sock_fd, netns_a_fd, netns_b_fd, result_fd;
+ struct stat st_a;
+
+ /* Get current netns (A) */
+ netns_a_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(netns_a_fd, 0);
+ ASSERT_EQ(fstat(netns_a_fd, &st_a), 0);
+
+ /* Create socket in netns A */
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(sock_fd, 0);
+
+ /* Create new netns (B) */
+ ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+ netns_b_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(netns_b_fd, 0);
+
+ /* Get netns from socket created in A */
+ result_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (result_fd < 0) {
+ close(sock_fd);
+ setns(netns_a_fd, CLONE_NEWNET);
+ close(netns_a_fd);
+ close(netns_b_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(result_fd, 0);
+ }
+
+ /* Verify it still points to netns A */
+ struct stat st_result_stat;
+ ASSERT_EQ(fstat(result_fd, &st_result_stat), 0);
+ ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino);
+
+ close(result_fd);
+ close(sock_fd);
+ close(netns_b_fd);
+
+ /* Restore original netns */
+ ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0);
+ close(netns_a_fd);
+}
+
+/*
+ * Test SIOCGSKNS fails on non-socket file descriptors.
+ */
+TEST(siocgskns_non_socket)
+{
+ int fd;
+ int pipefd[2];
+
+ /* Test on regular file */
+ fd = open("/dev/null", O_RDONLY);
+ ASSERT_GE(fd, 0);
+
+ ASSERT_LT(ioctl(fd, SIOCGSKNS), 0);
+ ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+ close(fd);
+
+ /* Test on pipe */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ ASSERT_LT(ioctl(pipefd[0], SIOCGSKNS), 0);
+ ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+
+ close(pipefd[0]);
+ close(pipefd[1]);
+}
+
+/*
+ * Test multiple sockets keep the same network namespace active.
+ * Create multiple sockets, verify closing some doesn't affect others.
+ */
+TEST(siocgskns_multiple_sockets)
+{
+ int socks[5];
+ int netns_fds[5];
+ int i;
+ struct stat st;
+ ino_t netns_ino;
+
+ /* Create new network namespace */
+ ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+ /* Create multiple sockets */
+ for (i = 0; i < 5; i++) {
+ socks[i] = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(socks[i], 0);
+ }
+
+ /* Get netns from all sockets */
+ for (i = 0; i < 5; i++) {
+ netns_fds[i] = ioctl(socks[i], SIOCGSKNS);
+ if (netns_fds[i] < 0) {
+ int j;
+ for (j = 0; j <= i; j++) {
+ close(socks[j]);
+ if (j < i && netns_fds[j] >= 0)
+ close(netns_fds[j]);
+ }
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fds[i], 0);
+ }
+ }
+
+ /* Verify all point to same netns */
+ ASSERT_EQ(fstat(netns_fds[0], &st), 0);
+ netns_ino = st.st_ino;
+
+ for (i = 1; i < 5; i++) {
+ ASSERT_EQ(fstat(netns_fds[i], &st), 0);
+ ASSERT_EQ(st.st_ino, netns_ino);
+ }
+
+ /* Close some sockets */
+ for (i = 0; i < 3; i++) {
+ close(socks[i]);
+ }
+
+ /* Remaining netns FDs should still be valid */
+ for (i = 3; i < 5; i++) {
+ char path[64];
+ snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]);
+ int test_fd = open(path, O_RDONLY);
+ ASSERT_GE(test_fd, 0);
+ close(test_fd);
+ }
+
+ /* Cleanup */
+ for (i = 0; i < 5; i++) {
+ if (i >= 3)
+ close(socks[i]);
+ close(netns_fds[i]);
+ }
+}
+
+/*
+ * Test socket keeps netns active after creating process exits.
+ * Verify that as long as the socket FD exists, the namespace remains active.
+ */
+TEST(siocgskns_netns_lifecycle)
+{
+ int sock_fd, netns_fd;
+ int ipc_sockets[2];
+ int syncpipe[2];
+ pid_t pid;
+ int status;
+ char sync_byte;
+ struct stat st;
+ ino_t netns_ino;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child */
+ close(ipc_sockets[0]);
+ close(syncpipe[1]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ /* Send socket to parent */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+
+ /* Wait for parent signal */
+ read(syncpipe[0], &sync_byte, 1);
+ close(syncpipe[0]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+
+ /* Receive socket FD */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Get netns from socket while child is alive */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ sync_byte = 'G';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+ close(sock_fd);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+ ASSERT_EQ(fstat(netns_fd, &st), 0);
+ netns_ino = st.st_ino;
+
+ /* Signal child to exit */
+ sync_byte = 'G';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+
+ /*
+ * Socket FD should still keep namespace active even after
+ * the creating process exited.
+ */
+ int test_fd = ioctl(sock_fd, SIOCGSKNS);
+ ASSERT_GE(test_fd, 0);
+
+ struct stat st_test;
+ ASSERT_EQ(fstat(test_fd, &st_test), 0);
+ ASSERT_EQ(st_test.st_ino, netns_ino);
+
+ close(test_fd);
+ close(netns_fd);
+
+ /* Close socket - namespace should become inactive */
+ close(sock_fd);
+}
+
+/*
+ * Test IPv6 sockets also work with SIOCGSKNS.
+ */
+TEST(siocgskns_ipv6)
+{
+ int sock_fd, netns_fd, current_netns_fd;
+ struct stat st1, st2;
+
+ /* Create an IPv6 TCP socket */
+ sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
+ ASSERT_GE(sock_fd, 0);
+
+ /* Use SIOCGSKNS */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Verify it matches current namespace */
+ current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(current_netns_fd, 0);
+
+ ASSERT_EQ(fstat(netns_fd, &st1), 0);
+ ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ close(sock_fd);
+ close(netns_fd);
+ close(current_netns_fd);
+}
+
+/*
+ * Test that socket-kept netns appears in listns() output.
+ * Verify that a network namespace kept alive by a socket FD appears in
+ * listns() output even after the creating process exits, and that it
+ * disappears when the socket is closed.
+ */
+TEST(siocgskns_listns_visibility)
+{
+ int sock_fd, netns_fd, owner_fd;
+ int ipc_sockets[2];
+ pid_t pid;
+ int status;
+ __u64 netns_id, owner_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ int ret, i;
+ bool found_netns = false;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new netns and socket */
+ close(ipc_sockets[0]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Get namespace ID */
+ ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+ if (ret < 0) {
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Get owner user namespace */
+ owner_fd = ioctl(netns_fd, NS_GET_USERNS);
+ if (owner_fd < 0) {
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_USERNS not supported");
+ ASSERT_GE(owner_fd, 0);
+ }
+
+ /* Get owner namespace ID */
+ ret = ioctl(owner_fd, NS_GET_ID, &owner_id);
+ if (ret < 0) {
+ close(owner_fd);
+ close(sock_fd);
+ close(netns_fd);
+ ASSERT_EQ(ret, 0);
+ }
+ close(owner_fd);
+
+ /* Namespace should appear in listns() output */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s", strerror(errno));
+ ASSERT_GE(ret, 0);
+ }
+
+ /* Search for our network namespace in the list */
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id) {
+ found_netns = true;
+ break;
+ }
+ }
+
+ ASSERT_TRUE(found_netns);
+ TH_LOG("Found netns %llu in listns() output (kept alive by socket)", netns_id);
+
+ /* Now verify with owner filtering */
+ req.user_ns_id = owner_id;
+ found_netns = false;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id) {
+ found_netns = true;
+ break;
+ }
+ }
+
+ ASSERT_TRUE(found_netns);
+ TH_LOG("Found netns %llu owned by userns %llu", netns_id, owner_id);
+
+ /* Close socket - namespace should become inactive and disappear from listns() */
+ close(sock_fd);
+ close(netns_fd);
+
+ /* Verify it's no longer in listns() output */
+ req.user_ns_id = 0;
+ found_netns = false;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id) {
+ found_netns = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found_netns);
+ TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id);
+}
+
+/*
+ * Test that socket-kept netns can be reopened via file handle.
+ * Verify that a network namespace kept alive by a socket FD can be
+ * reopened using file handles even after the creating process exits.
+ */
+TEST(siocgskns_file_handle)
+{
+ int sock_fd, netns_fd, reopened_fd;
+ int ipc_sockets[2];
+ pid_t pid;
+ int status;
+ struct stat st1, st2;
+ ino_t netns_ino;
+ __u64 netns_id;
+ struct file_handle *handle;
+ struct nsfs_file_handle *nsfs_fh;
+ int ret;
+
+ /* Allocate file_handle structure for nsfs */
+ handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+ ASSERT_NE(handle, NULL);
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new netns and socket */
+ close(ipc_sockets[0]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ ASSERT_EQ(fstat(netns_fd, &st1), 0);
+ netns_ino = st1.st_ino;
+
+ /* Get namespace ID */
+ ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Construct file handle from namespace ID */
+ nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+ nsfs_fh->ns_id = netns_id;
+ nsfs_fh->ns_type = 0; /* Type field not needed for reopening */
+ nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */
+
+ TH_LOG("Constructed file handle for netns %lu (id=%llu)", netns_ino, netns_id);
+
+ /* Reopen namespace using file handle (while socket still keeps it alive) */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ /* Verify it's the same namespace */
+ ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ TH_LOG("Successfully reopened netns %lu via file handle", netns_ino);
+
+ close(reopened_fd);
+
+ /* Close the netns FD */
+ close(netns_fd);
+
+ /* Try to reopen via file handle - should fail since namespace is now inactive */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(reopened_fd, 0);
+ TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno));
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Reopen namespace using file handle (while socket still keeps it alive) */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ /* Verify it's the same namespace */
+ ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ TH_LOG("Successfully reopened netns %lu via file handle", netns_ino);
+
+ /* Close socket - namespace should become inactive */
+ close(sock_fd);
+ free(handle);
+}
+
+/*
+ * Test combined listns() and file handle operations with socket-kept netns.
+ * Create a netns, keep it alive with a socket, verify it appears in listns(),
+ * then reopen it via file handle obtained from listns() entry.
+ */
+TEST(siocgskns_listns_and_file_handle)
+{
+ int sock_fd, netns_fd, userns_fd, reopened_fd;
+ int ipc_sockets[2];
+ pid_t pid;
+ int status;
+ struct stat st;
+ ino_t netns_ino;
+ __u64 netns_id, userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET | CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ int ret, i;
+ bool found_netns = false, found_userns = false;
+ struct file_handle *handle;
+ struct nsfs_file_handle *nsfs_fh;
+
+ /* Allocate file_handle structure for nsfs */
+ handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+ ASSERT_NE(handle, NULL);
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new userns and netns with socket */
+ close(ipc_sockets[0]);
+
+ if (setup_userns() < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ ASSERT_EQ(fstat(netns_fd, &st), 0);
+ netns_ino = st.st_ino;
+
+ /* Get namespace ID */
+ ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Get owner user namespace */
+ userns_fd = ioctl(netns_fd, NS_GET_USERNS);
+ if (userns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_USERNS not supported");
+ ASSERT_GE(userns_fd, 0);
+ }
+
+ /* Get owner namespace ID */
+ ret = ioctl(userns_fd, NS_GET_ID, &userns_id);
+ if (ret < 0) {
+ close(userns_fd);
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ ASSERT_EQ(ret, 0);
+ }
+ close(userns_fd);
+
+ TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu", netns_ino, netns_id, userns_id);
+
+ /* Verify namespace appears in listns() */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s", strerror(errno));
+ ASSERT_GE(ret, 0);
+ }
+
+ found_netns = false;
+ found_userns = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id)
+ found_netns = true;
+ if (ns_ids[i] == userns_id)
+ found_userns = true;
+ }
+ ASSERT_TRUE(found_netns);
+ ASSERT_TRUE(found_userns);
+ TH_LOG("Found netns %llu in listns() output", netns_id);
+
+ /* Construct file handle from namespace ID */
+ nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+ nsfs_fh->ns_id = netns_id;
+ nsfs_fh->ns_type = 0;
+ nsfs_fh->ns_inum = 0;
+
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ struct stat reopened_st;
+ ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0);
+ ASSERT_EQ(reopened_st.st_ino, netns_ino);
+
+ TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)", netns_ino);
+
+ close(reopened_fd);
+ close(netns_fd);
+
+ /* Try to reopen via file handle - should fail since namespace is now inactive */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(reopened_fd, 0);
+ TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno));
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Verify namespace appears in listns() */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s", strerror(errno));
+ ASSERT_GE(ret, 0);
+ }
+
+ found_netns = false;
+ found_userns = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id)
+ found_netns = true;
+ if (ns_ids[i] == userns_id)
+ found_userns = true;
+ }
+ ASSERT_TRUE(found_netns);
+ ASSERT_TRUE(found_userns);
+ TH_LOG("Found netns %llu in listns() output", netns_id);
+
+ close(netns_fd);
+
+ /* Verify namespace appears in listns() */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s", strerror(errno));
+ ASSERT_GE(ret, 0);
+ }
+
+ found_netns = false;
+ found_userns = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id)
+ found_netns = true;
+ if (ns_ids[i] == userns_id)
+ found_userns = true;
+ }
+ ASSERT_FALSE(found_netns);
+ ASSERT_FALSE(found_userns);
+ TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id);
+
+ close(sock_fd);
+ free(handle);
+}
+
+/*
+ * Test multi-level namespace resurrection across three user namespace levels.
+ *
+ * This test creates a complex namespace hierarchy with three levels of user
+ * namespaces and a network namespace at the deepest level. It verifies that
+ * the resurrection semantics work correctly when SIOCGSKNS is called on a
+ * socket from an inactive namespace tree, and that listns() and
+ * open_by_handle_at() correctly respect visibility rules.
+ *
+ * Hierarchy after child processes exit (all with 0 active refcount):
+ *
+ * net_L3A (0) <- Level 3 network namespace
+ * |
+ * +
+ * userns_L3 (0) <- Level 3 user namespace
+ * |
+ * +
+ * userns_L2 (0) <- Level 2 user namespace
+ * |
+ * +
+ * userns_L1 (0) <- Level 1 user namespace
+ * |
+ * x
+ * init_user_ns
+ *
+ * The test verifies:
+ * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain
+ * 2. After resurrection, all namespaces are visible in listns()
+ * 3. Resurrected namespaces can be reopened via file handles
+ * 4. Closing the netns FD cascades down: the entire ownership chain
+ * (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again
+ * 5. Inactive namespaces disappear from listns() and cannot be reopened
+ * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again
+ * 7. After second resurrection, namespaces are visible and can be reopened
+ */
+TEST(siocgskns_multilevel_resurrection)
+{
+ int ipc_sockets[2];
+ pid_t pid_l1, pid_l2, pid_l3;
+ int status;
+
+ /* Namespace file descriptors to be received from child */
+ int sock_L3A_fd = -1;
+ int netns_L3A_fd = -1;
+ __u64 netns_L3A_id;
+ __u64 userns_L1_id, userns_L2_id, userns_L3_id;
+
+ /* For listns() and file handle testing */
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET | CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ int ret, i;
+ struct file_handle *handle;
+ struct nsfs_file_handle *nsfs_fh;
+ int reopened_fd;
+
+ /* Allocate file handle for testing */
+ handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+ ASSERT_NE(handle, NULL);
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ /*
+ * Fork level 1 child that creates userns_L1
+ */
+ pid_l1 = fork();
+ ASSERT_GE(pid_l1, 0);
+
+ if (pid_l1 == 0) {
+ /* Level 1 child */
+ int ipc_L2[2];
+ close(ipc_sockets[0]);
+
+ /* Create userns_L1 */
+ if (setup_userns() < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Create socketpair for communicating with L2 child */
+ if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L2) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /*
+ * Fork level 2 child that creates userns_L2
+ */
+ pid_l2 = fork();
+ if (pid_l2 < 0) {
+ close(ipc_sockets[1]);
+ close(ipc_L2[0]);
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ if (pid_l2 == 0) {
+ /* Level 2 child */
+ int ipc_L3[2];
+ close(ipc_L2[0]);
+
+ /* Create userns_L2 (nested inside userns_L1) */
+ if (setup_userns() < 0) {
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ /* Create socketpair for communicating with L3 child */
+ if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L3) < 0) {
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ /*
+ * Fork level 3 child that creates userns_L3 and network namespaces
+ */
+ pid_l3 = fork();
+ if (pid_l3 < 0) {
+ close(ipc_L2[1]);
+ close(ipc_L3[0]);
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ if (pid_l3 == 0) {
+ /* Level 3 child - the deepest level */
+ int sock_fd;
+ close(ipc_L3[0]);
+
+ /* Create userns_L3 (nested inside userns_L2) */
+ if (setup_userns() < 0) {
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ /* Create network namespace at level 3 */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ /* Create socket in net_L3A */
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to L2 parent */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_L3[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_L3[1]);
+ exit(0);
+ }
+
+ /* Level 2 child - receive from L3 and forward to L1 */
+ close(ipc_L3[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+ int received_fd;
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_L3[0], &msg, 0);
+ close(ipc_L3[0]);
+
+ if (n != 1) {
+ close(ipc_L2[1]);
+ waitpid(pid_l3, NULL, 0);
+ exit(1);
+ }
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg) {
+ close(ipc_L2[1]);
+ waitpid(pid_l3, NULL, 0);
+ exit(1);
+ }
+ memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for L3 child */
+ waitpid(pid_l3, NULL, 0);
+
+ /* Forward the socket FD to L1 parent */
+ memset(&msg, 0, sizeof(msg));
+ buf[0] = 'Y';
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int));
+
+ if (sendmsg(ipc_L2[1], &msg, 0) < 0) {
+ close(received_fd);
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ close(received_fd);
+ close(ipc_L2[1]);
+ exit(0);
+ }
+
+ /* Level 1 child - receive from L2 and forward to parent */
+ close(ipc_L2[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+ int received_fd;
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_L2[0], &msg, 0);
+ close(ipc_L2[0]);
+
+ if (n != 1) {
+ close(ipc_sockets[1]);
+ waitpid(pid_l2, NULL, 0);
+ exit(1);
+ }
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg) {
+ close(ipc_sockets[1]);
+ waitpid(pid_l2, NULL, 0);
+ exit(1);
+ }
+ memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for L2 child */
+ waitpid(pid_l2, NULL, 0);
+
+ /* Forward the socket FD to parent */
+ memset(&msg, 0, sizeof(msg));
+ buf[0] = 'Z';
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(received_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(received_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent - receive the socket from the deepest level */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+
+ if (n != 1) {
+ free(handle);
+ waitpid(pid_l1, NULL, 0);
+ SKIP(return, "Failed to receive socket from child");
+ }
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg) {
+ free(handle);
+ waitpid(pid_l1, NULL, 0);
+ SKIP(return, "Failed to receive socket from child");
+ }
+ memcpy(&sock_L3A_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for L1 child */
+ waitpid(pid_l1, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /*
+ * At this point, all child processes have exited. The socket itself
+ * doesn't keep the namespace active - we need to call SIOCGSKNS which
+ * will resurrect the entire namespace tree by taking active references.
+ */
+
+ /* Get network namespace from socket - this resurrects the tree */
+ netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS);
+ if (netns_L3A_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_L3A_fd, 0);
+ }
+
+ /* Get namespace ID for net_L3A */
+ ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id);
+ if (ret < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */
+ int userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS);
+ if (userns_L3_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_USERNS not supported");
+ ASSERT_GE(userns_L3_fd, 0);
+ }
+
+ ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id);
+ ASSERT_EQ(ret, 0);
+
+ int userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS);
+ ASSERT_GE(userns_L2_fd, 0);
+ ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id);
+ ASSERT_EQ(ret, 0);
+
+ int userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS);
+ ASSERT_GE(userns_L1_fd, 0);
+ ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id);
+ ASSERT_EQ(ret, 0);
+
+ close(userns_L1_fd);
+ close(userns_L2_fd);
+ close(userns_L3_fd);
+
+ TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)",
+ netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id);
+
+ /*
+ * Test 1: Verify net_L3A is visible in listns() after resurrection.
+ * The entire ownership chain should be resurrected and visible.
+ */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ bool found_netns_L3A = false;
+ bool found_userns_L1 = false;
+ bool found_userns_L2 = false;
+ bool found_userns_L3 = false;
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_L3A_id)
+ found_netns_L3A = true;
+ if (ns_ids[i] == userns_L1_id)
+ found_userns_L1 = true;
+ if (ns_ids[i] == userns_L2_id)
+ found_userns_L2 = true;
+ if (ns_ids[i] == userns_L3_id)
+ found_userns_L3 = true;
+ }
+
+ ASSERT_TRUE(found_netns_L3A);
+ ASSERT_TRUE(found_userns_L1);
+ ASSERT_TRUE(found_userns_L2);
+ ASSERT_TRUE(found_userns_L3);
+ TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()");
+
+ /*
+ * Test 2: Verify net_L3A can be reopened via file handle.
+ */
+ nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+ nsfs_fh->ns_id = netns_L3A_id;
+ nsfs_fh->ns_type = 0;
+ nsfs_fh->ns_inum = 0;
+
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ close(reopened_fd);
+ TH_LOG("File handle test passed: net_L3A can be reopened");
+
+ /*
+ * Test 3: Verify that when we close the netns FD (dropping the last
+ * active reference), the entire tree becomes inactive and disappears
+ * from listns(). The cascade goes: net_L3A drops -> userns_L3 drops ->
+ * userns_L2 drops -> userns_L1 drops.
+ */
+ close(netns_L3A_fd);
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found_netns_L3A = false;
+ found_userns_L1 = false;
+ found_userns_L2 = false;
+ found_userns_L3 = false;
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_L3A_id)
+ found_netns_L3A = true;
+ if (ns_ids[i] == userns_L1_id)
+ found_userns_L1 = true;
+ if (ns_ids[i] == userns_L2_id)
+ found_userns_L2 = true;
+ if (ns_ids[i] == userns_L3_id)
+ found_userns_L3 = true;
+ }
+
+ ASSERT_FALSE(found_netns_L3A);
+ ASSERT_FALSE(found_userns_L1);
+ ASSERT_FALSE(found_userns_L2);
+ ASSERT_FALSE(found_userns_L3);
+ TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed");
+
+ /*
+ * Test 4: Verify file handle no longer works for inactive namespace.
+ */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd >= 0) {
+ close(reopened_fd);
+ free(handle);
+ ASSERT_TRUE(false); /* Should have failed */
+ }
+ TH_LOG("Inactive namespace correctly cannot be reopened via file handle");
+
+ /*
+ * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again.
+ * The socket is still valid, so we can call SIOCGSKNS on it to resurrect
+ * the namespace tree once more.
+ */
+ netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS);
+ ASSERT_GE(netns_L3A_fd, 0);
+
+ TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree");
+
+ /* Verify the namespace tree is resurrected and visible in listns() */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found_netns_L3A = false;
+ found_userns_L1 = false;
+ found_userns_L2 = false;
+ found_userns_L3 = false;
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_L3A_id)
+ found_netns_L3A = true;
+ if (ns_ids[i] == userns_L1_id)
+ found_userns_L1 = true;
+ if (ns_ids[i] == userns_L2_id)
+ found_userns_L2 = true;
+ if (ns_ids[i] == userns_L3_id)
+ found_userns_L3 = true;
+ }
+
+ ASSERT_TRUE(found_netns_L3A);
+ ASSERT_TRUE(found_userns_L1);
+ ASSERT_TRUE(found_userns_L2);
+ ASSERT_TRUE(found_userns_L3);
+ TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again");
+
+ /* Verify we can reopen via file handle again */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ close(reopened_fd);
+ TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection");
+
+ /* Final cleanup */
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ free(handle);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/stress_test.c b/tools/testing/selftests/namespaces/stress_test.c
new file mode 100644
index 000000000000..dd7df7d6cb27
--- /dev/null
+++ b/tools/testing/selftests/namespaces/stress_test.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Stress tests for namespace active reference counting.
+ *
+ * These tests validate that the active reference counting system can handle
+ * high load scenarios including rapid namespace creation/destruction, large
+ * numbers of concurrent namespaces, and various edge cases under stress.
+ */
+
+/*
+ * Test rapid creation and destruction of user namespaces.
+ * Create and destroy namespaces in quick succession to stress the
+ * active reference tracking and ensure no leaks occur.
+ */
+TEST(rapid_namespace_creation_destruction)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[256], ns_ids_after[256];
+ ssize_t ret_before, ret_after;
+ int i;
+
+ /* Get baseline count of active user namespaces */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+ /* Rapidly create and destroy 100 user namespaces */
+ for (i = 0; i < 100; i++) {
+ pid_t pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create user namespace and immediately exit */
+ if (setup_userns() < 0)
+ exit(1);
+ exit(0);
+ }
+
+ /* Parent: wait for child */
+ int status;
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+ }
+
+ /* Verify we're back to baseline (no leaked namespaces) */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test creating many concurrent namespaces.
+ * Verify that listns() correctly tracks all of them and that they all
+ * become inactive after processes exit.
+ */
+TEST(many_concurrent_namespaces)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512];
+ ssize_t ret_before, ret_during, ret_after;
+ pid_t pids[50];
+ int num_children = 50;
+ int i;
+ int sv[2];
+
+ /* Get baseline */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create many children, each with their own user namespace */
+ for (i = 0; i < num_children; i++) {
+ pids[i] = fork();
+ ASSERT_GE(pids[i], 0);
+
+ if (pids[i] == 0) {
+ /* Child: create user namespace and wait for parent signal */
+ char c;
+
+ close(sv[0]);
+
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Signal parent we're ready */
+ if (write(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal to exit */
+ if (read(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ close(sv[1]);
+ exit(0);
+ }
+ }
+
+ close(sv[1]);
+
+ /* Wait for all children to signal ready */
+ for (i = 0; i < num_children; i++) {
+ char c;
+ if (read(sv[0], &c, 1) != 1) {
+ /* If we fail to read, kill all children and exit */
+ close(sv[0]);
+ for (int j = 0; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ /* List namespaces while all children are running */
+ ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0);
+ ASSERT_GE(ret_during, 0);
+
+ TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during);
+
+ /* Should have at least num_children more namespaces than baseline */
+ ASSERT_GE(ret_during, ret_before + num_children);
+
+ /* Signal all children to exit */
+ for (i = 0; i < num_children; i++) {
+ char c = 'X';
+ if (write(sv[0], &c, 1) != 1) {
+ /* If we fail to write, kill remaining children */
+ close(sv[0]);
+ for (int j = i; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ close(sv[0]);
+
+ /* Wait for all children */
+ for (i = 0; i < num_children; i++) {
+ int status;
+ waitpid(pids[i], &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After all children exit: %zd active user namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test rapid namespace creation with different namespace types.
+ * Create multiple types of namespaces rapidly to stress the tracking system.
+ */
+TEST(rapid_mixed_namespace_creation)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0, /* All types */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_after[512];
+ ssize_t ret_before, ret_after;
+ int i;
+
+ /* Get baseline count */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active namespaces (all types)", ret_before);
+
+ /* Rapidly create and destroy namespaces with multiple types */
+ for (i = 0; i < 50; i++) {
+ pid_t pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create multiple namespace types */
+ if (setup_userns() < 0)
+ exit(1);
+
+ /* Create additional namespace types */
+ if (unshare(CLONE_NEWNET) < 0)
+ exit(1);
+ if (unshare(CLONE_NEWUTS) < 0)
+ exit(1);
+ if (unshare(CLONE_NEWIPC) < 0)
+ exit(1);
+
+ exit(0);
+ }
+
+ /* Parent: wait for child */
+ int status;
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test nested namespace creation under stress.
+ * Create deeply nested namespace hierarchies and verify proper cleanup.
+ */
+TEST(nested_namespace_stress)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_after[512];
+ ssize_t ret_before, ret_after;
+ int i;
+
+ /* Get baseline */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+ /* Create 20 processes, each with nested user namespaces */
+ for (i = 0; i < 20; i++) {
+ pid_t pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int userns_fd;
+ uid_t orig_uid = getuid();
+ int depth;
+
+ /* Create nested user namespaces (up to 5 levels) */
+ for (depth = 0; depth < 5; depth++) {
+ userns_fd = get_userns_fd(0, (depth == 0) ? orig_uid : 0, 1);
+ if (userns_fd < 0)
+ exit(1);
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ exit(1);
+ }
+ close(userns_fd);
+ }
+
+ exit(0);
+ }
+
+ /* Parent: wait for child */
+ int status;
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test listns() pagination under stress.
+ * Create many namespaces and verify pagination works correctly.
+ */
+TEST(listns_pagination_stress)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ pid_t pids[30];
+ int num_children = 30;
+ int i;
+ int sv[2];
+ __u64 all_ns_ids[512];
+ int total_found = 0;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create many children with user namespaces */
+ for (i = 0; i < num_children; i++) {
+ pids[i] = fork();
+ ASSERT_GE(pids[i], 0);
+
+ if (pids[i] == 0) {
+ char c;
+ close(sv[0]);
+
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Signal parent we're ready */
+ if (write(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal to exit */
+ if (read(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ close(sv[1]);
+ exit(0);
+ }
+ }
+
+ close(sv[1]);
+
+ /* Wait for all children to signal ready */
+ for (i = 0; i < num_children; i++) {
+ char c;
+ if (read(sv[0], &c, 1) != 1) {
+ /* If we fail to read, kill all children and exit */
+ close(sv[0]);
+ for (int j = 0; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ /* Paginate through all namespaces using small batch sizes */
+ req.ns_id = 0;
+ while (1) {
+ __u64 batch[5]; /* Small batch size to force pagination */
+ ssize_t ret;
+
+ ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS) {
+ close(sv[0]);
+ for (i = 0; i < num_children; i++)
+ kill(pids[i], SIGKILL);
+ for (i = 0; i < num_children; i++)
+ waitpid(pids[i], NULL, 0);
+ SKIP(return, "listns() not supported");
+ }
+ ASSERT_GE(ret, 0);
+ }
+
+ if (ret == 0)
+ break;
+
+ /* Store results */
+ for (i = 0; i < ret && total_found < 512; i++) {
+ all_ns_ids[total_found++] = batch[i];
+ }
+
+ /* Update cursor for next batch */
+ if (ret == ARRAY_SIZE(batch))
+ req.ns_id = batch[ret - 1];
+ else
+ break;
+ }
+
+ TH_LOG("Paginated through %d user namespaces", total_found);
+
+ /* Verify no duplicates in pagination */
+ for (i = 0; i < total_found; i++) {
+ for (int j = i + 1; j < total_found; j++) {
+ if (all_ns_ids[i] == all_ns_ids[j]) {
+ TH_LOG("Found duplicate ns_id: %llu at positions %d and %d",
+ (unsigned long long)all_ns_ids[i], i, j);
+ ASSERT_TRUE(false);
+ }
+ }
+ }
+
+ /* Signal all children to exit */
+ for (i = 0; i < num_children; i++) {
+ char c = 'X';
+ if (write(sv[0], &c, 1) != 1) {
+ close(sv[0]);
+ for (int j = i; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ close(sv[0]);
+
+ /* Wait for all children */
+ for (i = 0; i < num_children; i++) {
+ int status;
+ waitpid(pids[i], &status, 0);
+ }
+}
+
+/*
+ * Test concurrent namespace operations.
+ * Multiple processes creating, querying, and destroying namespaces concurrently.
+ */
+TEST(concurrent_namespace_operations)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_after[512];
+ ssize_t ret_before, ret_after;
+ pid_t pids[20];
+ int num_workers = 20;
+ int i;
+
+ /* Get baseline */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active namespaces", ret_before);
+
+ /* Create worker processes that do concurrent operations */
+ for (i = 0; i < num_workers; i++) {
+ pids[i] = fork();
+ ASSERT_GE(pids[i], 0);
+
+ if (pids[i] == 0) {
+ /* Each worker: create namespaces, list them, repeat */
+ int iterations;
+
+ for (iterations = 0; iterations < 10; iterations++) {
+ int userns_fd;
+ __u64 temp_ns_ids[100];
+ ssize_t ret;
+
+ /* Create a user namespace */
+ userns_fd = get_userns_fd(0, getuid(), 1);
+ if (userns_fd < 0)
+ continue;
+
+ /* List namespaces */
+ ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0);
+ (void)ret;
+
+ close(userns_fd);
+
+ /* Small delay */
+ usleep(1000);
+ }
+
+ exit(0);
+ }
+ }
+
+ /* Wait for all workers */
+ for (i = 0; i < num_workers; i++) {
+ int status;
+ waitpid(pids[i], &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After concurrent operations: %zd active namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test namespace churn - continuous creation and destruction.
+ * Simulates high-churn scenarios like container orchestration.
+ */
+TEST(namespace_churn)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_after[512];
+ ssize_t ret_before, ret_after;
+ int cycle;
+
+ /* Get baseline */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active namespaces", ret_before);
+
+ /* Simulate churn: batches of namespaces created and destroyed */
+ for (cycle = 0; cycle < 10; cycle++) {
+ pid_t batch_pids[10];
+ int i;
+
+ /* Create batch */
+ for (i = 0; i < 10; i++) {
+ batch_pids[i] = fork();
+ ASSERT_GE(batch_pids[i], 0);
+
+ if (batch_pids[i] == 0) {
+ /* Create multiple namespace types */
+ if (setup_userns() < 0)
+ exit(1);
+ if (unshare(CLONE_NEWNET) < 0)
+ exit(1);
+ if (unshare(CLONE_NEWUTS) < 0)
+ exit(1);
+
+ /* Keep namespaces alive briefly */
+ usleep(10000);
+ exit(0);
+ }
+ }
+
+ /* Wait for batch to complete */
+ for (i = 0; i < 10; i++) {
+ int status;
+ waitpid(batch_pids[i], &status, 0);
+ }
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/wrappers.h b/tools/testing/selftests/namespaces/wrappers.h
new file mode 100644
index 000000000000..9741a64a5b1d
--- /dev/null
+++ b/tools/testing/selftests/namespaces/wrappers.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/nsfs.h>
+#include <linux/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__
+#define __SELFTESTS_NAMESPACES_WRAPPERS_H__
+
+#ifndef __NR_listns
+ #if defined __alpha__
+ #define __NR_listns 580
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_listns 4470
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_listns 6470
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_listns 5470
+ #endif
+ #else
+ #define __NR_listns 470
+ #endif
+#endif
+
+static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids,
+ size_t nr_ns_ids, unsigned int flags)
+{
+ return syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags);
+}
+
+#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */