diff options
Diffstat (limited to 'tools/testing/selftests')
81 files changed, 13297 insertions, 1938 deletions
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index a85c19e9524e..0114108ab25f 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1071,7 +1071,7 @@ static bool sve_write_supported(struct test_config *config) static bool sve_write_fpsimd_supported(struct test_config *config) { - if (!sve_supported()) + if (!sve_supported() && !sme_supported()) return false; if ((config->svcr_in & SVCR_ZA) != (config->svcr_expected & SVCR_ZA)) @@ -1231,9 +1231,6 @@ static void sve_write_fpsimd(pid_t child, struct test_config *config) vl = vl_expected(config); vq = __sve_vq_from_vl(vl); - if (!vl) - return; - iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_FPSIMD); iov.iov_base = malloc(iov.iov_len); if (!iov.iov_base) { diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index e0fc3a001e28..f44d44618575 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -394,6 +394,58 @@ out: free(svebuf); } +/* Write the FPSIMD registers via the SVE regset when SVE is not supported */ +static void ptrace_sve_fpsimd_no_sve(pid_t child) +{ + void *svebuf; + struct user_sve_header *sve; + struct user_fpsimd_state *fpsimd, new_fpsimd; + unsigned int i, j; + unsigned char *p; + int ret; + + svebuf = malloc(SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD)); + if (!svebuf) { + ksft_test_result_fail("Failed to allocate FPSIMD buffer\n"); + return; + } + + /* On a system without SVE the VL should be set to 0 */ + memset(svebuf, 0, SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD)); + sve = svebuf; + sve->flags = SVE_PT_REGS_FPSIMD; + sve->size = SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD); + sve->vl = 0; + + /* Try to set a known FPSIMD state via PT_REGS_SVE */ + fpsimd = (struct user_fpsimd_state *)((char *)sve + + SVE_PT_FPSIMD_OFFSET); + for (i = 0; i < 32; ++i) { + p = (unsigned char *)&fpsimd->vregs[i]; + + for (j = 0; j < sizeof(fpsimd->vregs[i]); ++j) + p[j] = j; + } + + ret = set_sve(child, &vec_types[0], sve); + ksft_test_result(ret == 0, "FPSIMD write via SVE\n"); + if (ret) { + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + goto out; + } + + /* Verify via the FPSIMD regset */ + if (get_fpsimd(child, &new_fpsimd)) { + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + goto out; + } + ksft_test_result(memcmp(fpsimd, &new_fpsimd, sizeof(*fpsimd)) == 0, + "Verify FPSIMD write via SVE\n"); + +out: + free(svebuf); +} + /* Validate attempting to set SVE data and read SVE data */ static void ptrace_set_sve_get_sve_data(pid_t child, const struct vec_type *type, @@ -826,6 +878,15 @@ static int do_parent(pid_t child) } } + /* We support SVE writes of FPSMID format on SME only systems */ + if (!(getauxval(AT_HWCAP) & HWCAP_SVE) && + (getauxval(AT_HWCAP2) & HWCAP2_SME)) { + ptrace_sve_fpsimd_no_sve(child); + } else { + ksft_test_result_skip("FPSIMD write via SVE\n"); + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + } + ret = EXIT_SUCCESS; error: diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index 38080f3c3280..a8df05771670 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -276,7 +276,7 @@ function barf bl putdec puts ", iteration=" mov x0, x22 - bl putdec + bl putdecn puts "\tExpected [" mov x0, x10 mov x1, x12 diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 70b28c1e653e..f2a2fd236ca8 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -50,6 +50,7 @@ CONFIG_IPV6_SIT=y CONFIG_IPV6_TUNNEL=y CONFIG_KEYS=y CONFIG_LIRC=y +CONFIG_LIVEPATCH=y CONFIG_LWTUNNEL=y CONFIG_MODULE_SIG=y CONFIG_MODULE_SRCVERSION_ALL=y @@ -111,6 +112,8 @@ CONFIG_IP6_NF_FILTER=y CONFIG_NF_NAT=y CONFIG_PACKET=y CONFIG_RC_CORE=y +CONFIG_SAMPLES=y +CONFIG_SAMPLE_LIVEPATCH=m CONFIG_SECURITY=y CONFIG_SECURITYFS=y CONFIG_SYN_COOKIES=y diff --git a/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c new file mode 100644 index 000000000000..72aa5376c30e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <test_progs.h> +#include "testing_helpers.h" +#include "livepatch_trampoline.skel.h" + +static int load_livepatch(void) +{ + char path[4096]; + + /* CI will set KBUILD_OUTPUT */ + snprintf(path, sizeof(path), "%s/samples/livepatch/livepatch-sample.ko", + getenv("KBUILD_OUTPUT") ? : "../../../.."); + + return load_module(path, env_verbosity > VERBOSE_NONE); +} + +static void unload_livepatch(void) +{ + /* Disable the livepatch before unloading the module */ + system("echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled"); + + unload_module("livepatch_sample", env_verbosity > VERBOSE_NONE); +} + +static void read_proc_cmdline(void) +{ + char buf[4096]; + int fd, ret; + + fd = open("/proc/cmdline", O_RDONLY); + if (!ASSERT_OK_FD(fd, "open /proc/cmdline")) + return; + + ret = read(fd, buf, sizeof(buf)); + if (!ASSERT_GT(ret, 0, "read /proc/cmdline")) + goto out; + + ASSERT_OK(strncmp(buf, "this has been live patched", 26), "strncmp"); + +out: + close(fd); +} + +static void __test_livepatch_trampoline(bool fexit_first) +{ + struct livepatch_trampoline *skel = NULL; + int err; + + skel = livepatch_trampoline__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + goto out; + + skel->bss->my_pid = getpid(); + + if (!fexit_first) { + /* fentry program is loaded first by default */ + err = livepatch_trampoline__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + } else { + /* Manually load fexit program first. */ + skel->links.fexit_cmdline = bpf_program__attach(skel->progs.fexit_cmdline); + if (!ASSERT_OK_PTR(skel->links.fexit_cmdline, "attach_fexit")) + goto out; + + skel->links.fentry_cmdline = bpf_program__attach(skel->progs.fentry_cmdline); + if (!ASSERT_OK_PTR(skel->links.fentry_cmdline, "attach_fentry")) + goto out; + } + + read_proc_cmdline(); + + ASSERT_EQ(skel->bss->fentry_hit, 1, "fentry_hit"); + ASSERT_EQ(skel->bss->fexit_hit, 1, "fexit_hit"); +out: + livepatch_trampoline__destroy(skel); +} + +void test_livepatch_trampoline(void) +{ + int retry_cnt = 0; + +retry: + if (load_livepatch()) { + if (retry_cnt) { + ASSERT_OK(1, "load_livepatch"); + goto out; + } + /* + * Something else (previous run of the same test?) loaded + * the KLP module. Unload the KLP module and retry. + */ + unload_livepatch(); + retry_cnt++; + goto retry; + } + + if (test__start_subtest("fentry_first")) + __test_livepatch_trampoline(false); + + if (test__start_subtest("fexit_first")) + __test_livepatch_trampoline(true); +out: + unload_livepatch(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index f8eb7f9d4fd2..8fade8bdc451 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -6,11 +6,13 @@ #include <netinet/in.h> #include <test_progs.h> #include <unistd.h> +#include <errno.h> #include "cgroup_helpers.h" #include "network_helpers.h" #include "mptcp_sock.skel.h" #include "mptcpify.skel.h" #include "mptcp_subflow.skel.h" +#include "mptcp_sockmap.skel.h" #define NS_TEST "mptcp_ns" #define ADDR_1 "10.0.1.1" @@ -436,6 +438,142 @@ close_cgroup: close(cgroup_fd); } +/* Test sockmap on MPTCP server handling non-mp-capable clients. */ +static void test_sockmap_with_mptcp_fallback(struct mptcp_sockmap *skel) +{ + int listen_fd = -1, client_fd1 = -1, client_fd2 = -1; + int server_fd1 = -1, server_fd2 = -1, sent, recvd; + char snd[9] = "123456789"; + char rcv[10]; + + /* start server with MPTCP enabled */ + listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0); + if (!ASSERT_OK_FD(listen_fd, "sockmap-fb:start_mptcp_server")) + return; + + skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd)); + skel->bss->sk_index = 0; + /* create client without MPTCP enabled */ + client_fd1 = connect_to_fd_opts(listen_fd, NULL); + if (!ASSERT_OK_FD(client_fd1, "sockmap-fb:connect_to_fd")) + goto end; + + server_fd1 = accept(listen_fd, NULL, 0); + skel->bss->sk_index = 1; + client_fd2 = connect_to_fd_opts(listen_fd, NULL); + if (!ASSERT_OK_FD(client_fd2, "sockmap-fb:connect_to_fd")) + goto end; + + server_fd2 = accept(listen_fd, NULL, 0); + /* test normal redirect behavior: data sent by client_fd1 can be + * received by client_fd2 + */ + skel->bss->redirect_idx = 1; + sent = send(client_fd1, snd, sizeof(snd), 0); + if (!ASSERT_EQ(sent, sizeof(snd), "sockmap-fb:send(client_fd1)")) + goto end; + + /* try to recv more bytes to avoid truncation check */ + recvd = recv(client_fd2, rcv, sizeof(rcv), 0); + if (!ASSERT_EQ(recvd, sizeof(snd), "sockmap-fb:recv(client_fd2)")) + goto end; + +end: + if (client_fd1 >= 0) + close(client_fd1); + if (client_fd2 >= 0) + close(client_fd2); + if (server_fd1 >= 0) + close(server_fd1); + if (server_fd2 >= 0) + close(server_fd2); + close(listen_fd); +} + +/* Test sockmap rejection of MPTCP sockets - both server and client sides. */ +static void test_sockmap_reject_mptcp(struct mptcp_sockmap *skel) +{ + int listen_fd = -1, server_fd = -1, client_fd1 = -1; + int err, zero = 0; + + /* start server with MPTCP enabled */ + listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0); + if (!ASSERT_OK_FD(listen_fd, "start_mptcp_server")) + return; + + skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd)); + skel->bss->sk_index = 0; + /* create client with MPTCP enabled */ + client_fd1 = connect_to_fd(listen_fd, 0); + if (!ASSERT_OK_FD(client_fd1, "connect_to_fd client_fd1")) + goto end; + + /* bpf_sock_map_update() called from sockops should reject MPTCP sk */ + if (!ASSERT_EQ(skel->bss->helper_ret, -EOPNOTSUPP, "should reject")) + goto end; + + server_fd = accept(listen_fd, NULL, 0); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map), + &zero, &server_fd, BPF_NOEXIST); + if (!ASSERT_EQ(err, -EOPNOTSUPP, "server should be disallowed")) + goto end; + + /* MPTCP client should also be disallowed */ + err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map), + &zero, &client_fd1, BPF_NOEXIST); + if (!ASSERT_EQ(err, -EOPNOTSUPP, "client should be disallowed")) + goto end; +end: + if (client_fd1 >= 0) + close(client_fd1); + if (server_fd >= 0) + close(server_fd); + close(listen_fd); +} + +static void test_mptcp_sockmap(void) +{ + struct mptcp_sockmap *skel; + struct netns_obj *netns; + int cgroup_fd, err; + + cgroup_fd = test__join_cgroup("/mptcp_sockmap"); + if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup: mptcp_sockmap")) + return; + + skel = mptcp_sockmap__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_sockmap")) + goto close_cgroup; + + skel->links.mptcp_sockmap_inject = + bpf_program__attach_cgroup(skel->progs.mptcp_sockmap_inject, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.mptcp_sockmap_inject, "attach sockmap")) + goto skel_destroy; + + err = bpf_prog_attach(bpf_program__fd(skel->progs.mptcp_sockmap_redirect), + bpf_map__fd(skel->maps.sock_map), + BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach stream verdict")) + goto skel_destroy; + + netns = netns_new(NS_TEST, true); + if (!ASSERT_OK_PTR(netns, "netns_new: mptcp_sockmap")) + goto skel_destroy; + + if (endpoint_init("subflow") < 0) + goto close_netns; + + test_sockmap_with_mptcp_fallback(skel); + test_sockmap_reject_mptcp(skel); + +close_netns: + netns_free(netns); +skel_destroy: + mptcp_sockmap__destroy(skel); +close_cgroup: + close(cgroup_fd); +} + void test_mptcp(void) { if (test__start_subtest("base")) @@ -444,4 +582,6 @@ void test_mptcp(void) test_mptcpify(); if (test__start_subtest("subflow")) test_subflow(); + if (test__start_subtest("sockmap")) + test_mptcp_sockmap(); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c new file mode 100644 index 000000000000..c9efdd2a5b18 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include "stacktrace_ips.skel.h" + +#ifdef __x86_64__ +static int check_stacktrace_ips(int fd, __u32 key, int cnt, ...) +{ + __u64 ips[PERF_MAX_STACK_DEPTH]; + struct ksyms *ksyms = NULL; + int i, err = 0; + va_list args; + + /* sorted by addr */ + ksyms = load_kallsyms_local(); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_local")) + return -1; + + /* unlikely, but... */ + if (!ASSERT_LT(cnt, PERF_MAX_STACK_DEPTH, "check_max")) + return -1; + + err = bpf_map_lookup_elem(fd, &key, ips); + if (err) + goto out; + + /* + * Compare all symbols provided via arguments with stacktrace ips, + * and their related symbol addresses.t + */ + va_start(args, cnt); + + for (i = 0; i < cnt; i++) { + unsigned long val; + struct ksym *ksym; + + val = va_arg(args, unsigned long); + ksym = ksym_search_local(ksyms, ips[i]); + if (!ASSERT_OK_PTR(ksym, "ksym_search_local")) + break; + ASSERT_EQ(ksym->addr, val, "stack_cmp"); + } + + va_end(args); + +out: + free_kallsyms_local(ksyms); + return err; +} + +static void test_stacktrace_ips_kprobe_multi(bool retprobe) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, + .retprobe = retprobe + ); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.kprobe_multi_test = bpf_program__attach_kprobe_multi_opts( + skel->progs.kprobe_multi_test, + "bpf_testmod_stacktrace_test", &opts); + if (!ASSERT_OK_PTR(skel->links.kprobe_multi_test, "bpf_program__attach_kprobe_multi_opts")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + +cleanup: + stacktrace_ips__destroy(skel); +} + +static void test_stacktrace_ips_raw_tp(void) +{ + __u32 info_len = sizeof(struct bpf_prog_info); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_prog_info info = {}; + struct stacktrace_ips *skel; + __u64 bpf_prog_ksym = 0; + int err; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.rawtp_test = bpf_program__attach_raw_tracepoint( + skel->progs.rawtp_test, + "bpf_testmod_test_read"); + if (!ASSERT_OK_PTR(skel->links.rawtp_test, "bpf_program__attach_raw_tracepoint")) + goto cleanup; + + /* get bpf program address */ + info.jited_ksyms = ptr_to_u64(&bpf_prog_ksym); + info.nr_jited_ksyms = 1; + err = bpf_prog_get_info_by_fd(bpf_program__fd(skel->progs.rawtp_test), + &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 2, + bpf_prog_ksym, + ksym_get_addr("bpf_trace_run2")); + +cleanup: + stacktrace_ips__destroy(skel); +} + +static void __test_stacktrace_ips(void) +{ + if (test__start_subtest("kprobe_multi")) + test_stacktrace_ips_kprobe_multi(false); + if (test__start_subtest("kretprobe_multi")) + test_stacktrace_ips_kprobe_multi(true); + if (test__start_subtest("raw_tp")) + test_stacktrace_ips_raw_tp(); +} +#else +static void __test_stacktrace_ips(void) +{ + test__skip(); +} +#endif + +void test_stacktrace_ips(void) +{ + __test_stacktrace_ips(); +} diff --git a/tools/testing/selftests/bpf/progs/iters_looping.c b/tools/testing/selftests/bpf/progs/iters_looping.c index 05fa5ce7fc59..d00fd570255a 100644 --- a/tools/testing/selftests/bpf/progs/iters_looping.c +++ b/tools/testing/selftests/bpf/progs/iters_looping.c @@ -161,3 +161,56 @@ int simplest_loop(void *ctx) return 0; } + +__used +static void iterator_with_diff_stack_depth(int x) +{ + struct bpf_iter_num iter; + + asm volatile ( + "if r1 == 42 goto 0f;" + "*(u64 *)(r10 - 128) = 0;" + "0:" + /* create iterator */ + "r1 = %[iter];" + "r2 = 0;" + "r3 = 10;" + "call %[bpf_iter_num_new];" + "1:" + /* consume next item */ + "r1 = %[iter];" + "call %[bpf_iter_num_next];" + "if r0 == 0 goto 2f;" + "goto 1b;" + "2:" + /* destroy iterator */ + "r1 = %[iter];" + "call %[bpf_iter_num_destroy];" + : + : __imm_ptr(iter), ITER_HELPERS + : __clobber_common, "r6" + ); +} + +SEC("socket") +__success +__naked int widening_stack_size_bug(void *ctx) +{ + /* + * Depending on iterator_with_diff_stack_depth() parameter value, + * subprogram stack depth is either 8 or 128 bytes. Arrange values so + * that it is 128 on a first call and 8 on a second. This triggered a + * bug in verifier's widen_imprecise_scalars() logic. + */ + asm volatile ( + "r6 = 0;" + "r1 = 0;" + "1:" + "call iterator_with_diff_stack_depth;" + "r1 = 42;" + "r6 += 1;" + "if r6 < 2 goto 1b;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} diff --git a/tools/testing/selftests/bpf/progs/livepatch_trampoline.c b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c new file mode 100644 index 000000000000..15579d5bcd91 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +int fentry_hit; +int fexit_hit; +int my_pid; + +SEC("fentry/cmdline_proc_show") +int BPF_PROG(fentry_cmdline) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + fentry_hit = 1; + return 0; +} + +SEC("fexit/cmdline_proc_show") +int BPF_PROG(fexit_cmdline) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + fexit_hit = 1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/mptcp_sockmap.c b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c new file mode 100644 index 000000000000..d4eef0cbadb9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bpf_tracing_net.h" + +char _license[] SEC("license") = "GPL"; + +int sk_index; +int redirect_idx; +int trace_port; +int helper_ret; +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); + __uint(max_entries, 100); +} sock_map SEC(".maps"); + +SEC("sockops") +int mptcp_sockmap_inject(struct bpf_sock_ops *skops) +{ + struct bpf_sock *sk; + + /* only accept specified connection */ + if (skops->local_port != trace_port || + skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) + return 1; + + sk = skops->sk; + if (!sk) + return 1; + + /* update sk handler */ + helper_ret = bpf_sock_map_update(skops, &sock_map, &sk_index, BPF_NOEXIST); + + return 1; +} + +SEC("sk_skb/stream_verdict") +int mptcp_sockmap_redirect(struct __sk_buff *skb) +{ + /* redirect skb to the sk under sock_map[redirect_idx] */ + return bpf_sk_redirect_map(skb, &sock_map, redirect_idx, 0); +} diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c new file mode 100644 index 000000000000..a96c8150d7f5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#ifndef PERF_MAX_STACK_DEPTH +#define PERF_MAX_STACK_DEPTH 127 +#endif + +typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH]; + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(max_entries, 16384); + __type(key, __u32); + __type(value, stack_trace_t); +} stackmap SEC(".maps"); + +extern bool CONFIG_UNWINDER_ORC __kconfig __weak; + +/* + * This function is here to have CONFIG_UNWINDER_ORC + * used and added to object BTF. + */ +int unused(void) +{ + return CONFIG_UNWINDER_ORC ? 0 : 1; +} + +__u32 stack_key; + +SEC("kprobe.multi") +int kprobe_multi_test(struct pt_regs *ctx) +{ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + +SEC("raw_tp/bpf_testmod_test_read") +int rawtp_test(void *ctx) +{ + /* Skip ebpf program entry in the stack. */ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index b4a0d0cc8ec8..3662515f0107 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -10,7 +10,7 @@ SEC("syscall") __failure __msg("Possibly NULL pointer passed") int stream_vprintk_null_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL); return 0; } @@ -18,7 +18,7 @@ SEC("syscall") __failure __msg("R3 type=scalar expected=") int stream_vprintk_scalar_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL); return 0; } @@ -26,7 +26,7 @@ SEC("syscall") __failure __msg("arg#1 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c index 23217f06a3ec..663a80990f8f 100644 --- a/tools/testing/selftests/bpf/progs/task_work.c +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -66,7 +66,7 @@ int oncpu_hash_map(struct pt_regs *args) if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); return 0; } @@ -80,7 +80,7 @@ int oncpu_array_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL); + bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL); return 0; } @@ -102,6 +102,6 @@ int oncpu_lru_map(struct pt_regs *args) work = bpf_map_lookup_elem(&lrumap, &key); if (!work || work->data[0]) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 77fe8f28facd..1270953fd092 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); return 0; } @@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args) struct bpf_task_work tw; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL); return 0; } @@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args) struct task_struct *task; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL); return 0; } @@ -91,6 +91,6 @@ int map_null(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c index 90fca06fff56..55e555f7f41b 100644 --- a/tools/testing/selftests/bpf/progs/task_work_stress.c +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -51,8 +51,8 @@ int schedule_task_work(void *ctx) if (!work) return 0; } - err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, - process_work, NULL); + err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work, NULL); if (err) __sync_fetch_and_add(&schedule_error, 1); else diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 8074bc5f6f20..ed0a4721d8fd 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -417,6 +417,30 @@ noinline int bpf_testmod_fentry_test11(u64 a, void *b, short c, int d, return a + (long)b + c + d + (long)e + f + g + h + i + j + k; } +noinline void bpf_testmod_stacktrace_test(void) +{ + /* used for stacktrace test as attach function */ + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_3(void) +{ + bpf_testmod_stacktrace_test(); + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_2(void) +{ + bpf_testmod_stacktrace_test_3(); + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_1(void) +{ + bpf_testmod_stacktrace_test_2(); + asm volatile (""); +} + int bpf_testmod_fentry_ok; noinline ssize_t @@ -497,6 +521,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, 21, 22, 23, 24, 25, 26) != 231) goto out; + bpf_testmod_stacktrace_test_1(); + bpf_testmod_fentry_ok = 1; out: return -EIO; /* always fail */ diff --git a/tools/testing/selftests/cachestat/.gitignore b/tools/testing/selftests/cachestat/.gitignore index d6c30b43a4bb..abbb13b6e96b 100644 --- a/tools/testing/selftests/cachestat/.gitignore +++ b/tools/testing/selftests/cachestat/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only test_cachestat +tmpshmcstat diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index c952640f163b..ab838bcb9ec5 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -226,7 +226,7 @@ bool run_cachestat_test(enum file_type type) int syscall_ret; size_t compute_len = PS * 512; struct cachestat_range cs_range = { PS, compute_len }; - char *filename = "tmpshmcstat"; + char *filename = "tmpshmcstat", *map; struct cachestat cs; bool ret = true; int fd; @@ -257,7 +257,7 @@ bool run_cachestat_test(enum file_type type) } break; case FILE_MMAP: - char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, + map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { diff --git a/tools/testing/selftests/coredump/.gitignore b/tools/testing/selftests/coredump/.gitignore new file mode 100644 index 000000000000..097f52db0be9 --- /dev/null +++ b/tools/testing/selftests/coredump/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +stackdump_test +coredump_socket_test +coredump_socket_protocol_test diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile index 77b3665c73c7..dece1a31d561 100644 --- a/tools/testing/selftests/coredump/Makefile +++ b/tools/testing/selftests/coredump/Makefile @@ -1,7 +1,13 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -TEST_GEN_PROGS := stackdump_test +TEST_GEN_PROGS := stackdump_test \ + coredump_socket_test \ + coredump_socket_protocol_test TEST_FILES := stackdump include ../lib.mk + +$(OUTPUT)/stackdump_test: coredump_test_helpers.c +$(OUTPUT)/coredump_socket_test: coredump_test_helpers.c +$(OUTPUT)/coredump_socket_protocol_test: coredump_test_helpers.c diff --git a/tools/testing/selftests/coredump/coredump_socket_protocol_test.c b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c new file mode 100644 index 000000000000..d19b6717c53e --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c @@ -0,0 +1,1568 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/stat.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "coredump_test.h" + +#define NUM_CRASHING_COREDUMPS 5 + +FIXTURE_SETUP(coredump) +{ + FILE *file; + int ret; + + self->pid_coredump_server = -ESRCH; + self->fd_tmpfs_detached = -1; + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + self->fd_tmpfs_detached = create_detached_tmpfs(); + ASSERT_GE(self->fd_tmpfs_detached, 0); + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret, status; + + if (self->pid_coredump_server > 0) { + kill(self->pid_coredump_server, SIGTERM); + waitpid(self->pid_coredump_server, &status, 0); + } + unlink("/tmp/coredump.file"); + unlink("/tmp/coredump.socket"); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + if (self->fd_tmpfs_detached >= 0) { + ret = close(self->fd_tmpfs_detached); + if (ret < 0) { + reason = "Unable to close detached tmpfs"; + goto fail; + } + self->fd_tmpfs_detached = -1; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason); +} + +TEST_F(coredump, socket_request_kernel) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_kernel: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_kernel: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_kernel: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_kernel: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_kernel: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_kernel: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_kernel: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + fd_core_file = creat("/tmp/coredump.file", 0644); + if (fd_core_file < 0) { + fprintf(stderr, "socket_request_kernel: creat coredump file failed: %m\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_kernel: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_kernel: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_kernel: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_kernel: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_request_kernel: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "socket_request_kernel: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_kernel: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); + ASSERT_GT(st.st_size, 0); + system("file /tmp/coredump.file"); +} + +TEST_F(coredump, socket_request_userspace) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_userspace: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_userspace: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_userspace: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_userspace: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_userspace: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_userspace: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_userspace: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_userspace: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_userspace: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_userspace: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_userspace: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) { + fprintf(stderr, "socket_request_userspace: unexpected data received (expected no coredump data)\n"); + goto out; + } + + if (bytes_read < 0) { + fprintf(stderr, "socket_request_userspace: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_userspace: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_reject) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_reject: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_reject: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_reject: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_reject: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_reject: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_reject: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_reject: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_reject: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_reject: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_reject: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_reject: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) { + fprintf(stderr, "socket_request_reject: unexpected data received (expected no coredump data for REJECT)\n"); + goto out; + } + + if (bytes_read < 0) { + fprintf(stderr, "socket_request_reject: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_reject: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_flag_combination) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_flag_combination: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_flag_combination: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_flag_combination: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_invalid_flag_combination: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) { + fprintf(stderr, "socket_request_invalid_flag_combination: read_marker COREDUMP_MARK_CONFLICTING failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_flag_combination: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_unknown_flag) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_unknown_flag: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_unknown_flag: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_unknown_flag: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_unknown_flag: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_unknown_flag: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_unknown_flag: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_unknown_flag: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_unknown_flag: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_unknown_flag: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) { + fprintf(stderr, "socket_request_unknown_flag: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) { + fprintf(stderr, "socket_request_unknown_flag: read_marker COREDUMP_MARK_UNSUPPORTED failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_unknown_flag: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_small) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_size_small: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_size_small: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_size_small: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_size_small: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_size_small: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_size_small: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_size_small: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_size_small: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_size_small: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 / 2)) { + fprintf(stderr, "socket_request_invalid_size_small: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) { + fprintf(stderr, "socket_request_invalid_size_small: read_marker COREDUMP_MARK_MINSIZE failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_size_small: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_large) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_size_large: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_size_large: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_size_large: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_size_large: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_size_large: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_size_large: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_size_large: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_size_large: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_size_large: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) { + fprintf(stderr, "socket_request_invalid_size_large: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) { + fprintf(stderr, "socket_request_invalid_size_large: read_marker COREDUMP_MARK_MAXSIZE failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_size_large: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGSEGV + * + * Verify that when using socket-based coredump protocol, + * the coredump_signal field is correctly exposed as SIGSEGV. + */ +TEST_F(coredump, socket_coredump_signal_sigsegv) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGSEGV) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n", + info.coredump_signal, SIGSEGV); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGSEGV); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGSEGV); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGABRT + * + * Verify that when using socket-based coredump protocol, + * the coredump_signal field is correctly exposed as SIGABRT. + */ +TEST_F(coredump, socket_coredump_signal_sigabrt) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGABRT) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n", + info.coredump_signal, SIGABRT); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + abort(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGABRT); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGABRT); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500) +{ + int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; + pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + struct coredump_req req = {}; + + close(ipc_sockets[0]); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "Failed to create and listen on unix socket\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "Failed to notify parent via ipc socket\n"); + goto out; + } + close(ipc_sockets[1]); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd); + goto out; + } + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file); + goto out; + } + } + + close(fd_core_file); + close(fd_peer_pidfd); + close(fd_coredump); + fd_peer_pidfd = -1; + fd_coredump = -1; + } + + exit_code = EXIT_SUCCESS; +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + waitpid(pid[i], &status[i], 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + } + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500) +{ + int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; + pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS]; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0; + fd_server = -1; + exit_code = EXIT_FAILURE; + n_conns = 0; + close(ipc_sockets[0]); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: write_nointr to ipc socket failed: %m\n"); + goto out; + } + close(ipc_sockets[1]); + + while (n_conns < NUM_CRASHING_COREDUMPS) { + int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + struct coredump_req req = {}; + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + continue; + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: accept4 failed: %m\n"); + goto out; + } + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_peer_pidfd failed\n"); + goto out; + } + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_pidfd_info failed\n"); + goto out; + } + if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: missing PIDFD_INFO_COREDUMP or PIDFD_COREDUMPED\n"); + goto out; + } + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_coredump_req failed\n"); + goto out; + } + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: check_coredump_req failed\n"); + goto out; + } + if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: send_coredump_ack failed\n"); + goto out; + } + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_marker failed\n"); + goto out; + } + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: open_coredump_tmpfile failed: %m\n"); + goto out; + } + pid_t worker = fork(); + if (worker == 0) { + close(fd_server); + process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file); + } + worker_pids[n_conns] = worker; + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_core_file >= 0) + close(fd_core_file); + n_conns++; + } + exit_code = EXIT_SUCCESS; +out: + if (fd_server >= 0) + close(fd_server); + + // Reap all worker processes + for (int i = 0; i < n_conns; i++) { + int wstatus; + if (waitpid(worker_pids[i], &wstatus, 0) < 0) { + fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]); + } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) { + fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus)); + exit_code = EXIT_FAILURE; + } + } + + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + ASSERT_GE(waitpid(pid[i], &status[i], 0), 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + } + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/coredump/coredump_socket_test.c b/tools/testing/selftests/coredump/coredump_socket_test.c new file mode 100644 index 000000000000..7e26d4a6a15d --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_socket_test.c @@ -0,0 +1,742 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/stat.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "coredump_test.h" + +FIXTURE_SETUP(coredump) +{ + FILE *file; + int ret; + + self->pid_coredump_server = -ESRCH; + self->fd_tmpfs_detached = -1; + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + self->fd_tmpfs_detached = create_detached_tmpfs(); + ASSERT_GE(self->fd_tmpfs_detached, 0); + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret, status; + + if (self->pid_coredump_server > 0) { + kill(self->pid_coredump_server, SIGTERM); + waitpid(self->pid_coredump_server, &status, 0); + } + unlink("/tmp/coredump.file"); + unlink("/tmp/coredump.socket"); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + if (self->fd_tmpfs_detached >= 0) { + ret = close(self->fd_tmpfs_detached); + if (ret < 0) { + reason = "Unable to close detached tmpfs"; + goto fail; + } + self->fd_tmpfs_detached = -1; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason); +} + +TEST_F(coredump, socket) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket test: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket test: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket test: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket test: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket test: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket test: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket test: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + fd_core_file = creat("/tmp/coredump.file", 0644); + if (fd_core_file < 0) { + fprintf(stderr, "socket test: creat coredump file failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket test: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "socket test: write to core file failed (read=%zd, write=%zd): %m\n", bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket test: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); + ASSERT_GT(st.st_size, 0); +} + +TEST_F(coredump, socket_detect_userspace_client) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = { + .mask = PIDFD_INFO_COREDUMP, + }; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_detect_userspace_client: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_detect_userspace_client: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_detect_userspace_client: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_detect_userspace_client: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_detect_userspace_client: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_detect_userspace_client: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (info.coredump_mask & PIDFD_COREDUMPED) { + fprintf(stderr, "socket_detect_userspace_client: PIDFD_COREDUMPED incorrectly set (should be userspace client)\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_detect_userspace_client: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) { + int fd_socket; + ssize_t ret; + const struct sockaddr_un coredump_sk = { + .sun_family = AF_UNIX, + .sun_path = "/tmp/coredump.socket", + }; + size_t coredump_sk_len = + offsetof(struct sockaddr_un, sun_path) + + sizeof("/tmp/coredump.socket"); + + fd_socket = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd_socket < 0) { + fprintf(stderr, "socket_detect_userspace_client (client): socket failed: %m\n"); + _exit(EXIT_FAILURE); + } + + ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len); + if (ret < 0) { + fprintf(stderr, "socket_detect_userspace_client (client): connect failed: %m\n"); + _exit(EXIT_FAILURE); + } + + close(fd_socket); + pause(); + fprintf(stderr, "socket_detect_userspace_client (client): completed successfully\n"); + _exit(EXIT_SUCCESS); + } + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0); + ASSERT_EQ(close(pidfd), 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + ASSERT_NE(stat("/tmp/coredump.file", &st), 0); + ASSERT_EQ(errno, ENOENT); +} + +TEST_F(coredump, socket_enoent) +{ + int pidfd, status; + pid_t pid; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); +} + +TEST_F(coredump, socket_no_listener) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + int ipc_sockets[2]; + char c; + const struct sockaddr_un coredump_sk = { + .sun_family = AF_UNIX, + .sun_path = "/tmp/coredump.socket", + }; + size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + + sizeof("/tmp/coredump.socket"); + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd_server < 0) { + fprintf(stderr, "socket_no_listener: socket failed: %m\n"); + goto out; + } + + ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); + if (ret < 0) { + fprintf(stderr, "socket_no_listener: bind failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_no_listener: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_no_listener: completed successfully\n"); +out: + if (fd_server >= 0) + close(fd_server); + close(ipc_sockets[1]); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump + * + * Verify that when using simple socket-based coredump (@ pattern), + * the coredump_signal field is correctly exposed as SIGSEGV. + */ +TEST_F(coredump, socket_coredump_signal_sigsegv) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGSEGV) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n", + info.coredump_signal, SIGSEGV); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: open_coredump_tmpfile failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGSEGV); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGSEGV); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump with SIGABRT + * + * Verify that when using simple socket-based coredump (@ pattern), + * the coredump_signal field is correctly exposed as SIGABRT. + */ +TEST_F(coredump, socket_coredump_signal_sigabrt) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGABRT) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n", + info.coredump_signal, SIGABRT); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: open_coredump_tmpfile failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + abort(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGABRT); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGABRT); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_invalid_paths) +{ + ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@..")); + + ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@@..")); + + ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket")); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/coredump/coredump_test.h b/tools/testing/selftests/coredump/coredump_test.h new file mode 100644 index 000000000000..ed47f01fa53c --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_test.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __COREDUMP_TEST_H +#define __COREDUMP_TEST_H + +#include <stdbool.h> +#include <sys/types.h> +#include <linux/coredump.h> + +#include "../kselftest_harness.h" +#include "../pidfd/pidfd.h" + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define NUM_THREAD_SPAWN 128 + +/* Coredump fixture */ +FIXTURE(coredump) +{ + char original_core_pattern[256]; + pid_t pid_coredump_server; + int fd_tmpfs_detached; +}; + +/* Shared helper function declarations */ +void *do_nothing(void *arg); +void crashing_child(void); +int create_detached_tmpfs(void); +int create_and_listen_unix_socket(const char *path); +bool set_core_pattern(const char *pattern); +int get_peer_pidfd(int fd); +bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info); + +/* Inline helper that uses harness types */ +static inline void wait_and_check_coredump_server(pid_t pid_coredump_server, + struct __test_metadata *const _metadata, + FIXTURE_DATA(coredump) *self) +{ + int status; + waitpid(pid_coredump_server, &status, 0); + self->pid_coredump_server = -ESRCH; + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +/* Protocol helper function declarations */ +ssize_t recv_marker(int fd); +bool read_marker(int fd, enum coredump_mark mark); +bool read_coredump_req(int fd, struct coredump_req *req); +bool send_coredump_ack(int fd, const struct coredump_req *req, + __u64 mask, size_t size_ack); +bool check_coredump_req(const struct coredump_req *req, size_t min_size, + __u64 required_mask); +int open_coredump_tmpfile(int fd_tmpfs_detached); +void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file); + +#endif /* __COREDUMP_TEST_H */ diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c new file mode 100644 index 000000000000..a6f6d5f2ae07 --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_test_helpers.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/un.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../filesystems/wrappers.h" +#include "../pidfd/pidfd.h" + +/* Forward declarations to avoid including harness header */ +struct __test_metadata; + +/* Match the fixture definition from coredump_test.h */ +struct _fixture_coredump_data { + char original_core_pattern[256]; + pid_t pid_coredump_server; + int fd_tmpfs_detached; +}; + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define NUM_THREAD_SPAWN 128 + +void *do_nothing(void *arg) +{ + (void)arg; + while (1) + pause(); + + return NULL; +} + +void crashing_child(void) +{ + pthread_t thread; + int i; + + for (i = 0; i < NUM_THREAD_SPAWN; ++i) + pthread_create(&thread, NULL, do_nothing, NULL); + + /* crash on purpose */ + i = *(int *)NULL; +} + +int create_detached_tmpfs(void) +{ + int fd_context, fd_tmpfs; + + fd_context = sys_fsopen("tmpfs", 0); + if (fd_context < 0) + return -1; + + if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) + return -1; + + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + close(fd_context); + return fd_tmpfs; +} + +int create_and_listen_unix_socket(const char *path) +{ + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + assert(strlen(path) < sizeof(addr.sun_path) - 1); + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + size_t addr_len = + offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1; + int fd, ret; + + fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd < 0) + goto out; + + ret = bind(fd, (const struct sockaddr *)&addr, addr_len); + if (ret < 0) + goto out; + + ret = listen(fd, 128); + if (ret < 0) + goto out; + + return fd; + +out: + if (fd >= 0) + close(fd); + return -1; +} + +bool set_core_pattern(const char *pattern) +{ + int fd; + ssize_t ret; + + fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC); + if (fd < 0) + return false; + + ret = write(fd, pattern, strlen(pattern)); + close(fd); + if (ret < 0) + return false; + + fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern)); + return ret == strlen(pattern); +} + +int get_peer_pidfd(int fd) +{ + int fd_peer_pidfd; + socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd); + int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, + &fd_peer_pidfd_len); + if (ret < 0) { + fprintf(stderr, "get_peer_pidfd: getsockopt(SO_PEERPIDFD) failed: %m\n"); + return -1; + } + fprintf(stderr, "get_peer_pidfd: successfully retrieved pidfd %d\n", fd_peer_pidfd); + return fd_peer_pidfd; +} + +bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) +{ + int ret; + memset(info, 0, sizeof(*info)); + info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL; + ret = ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info); + if (ret < 0) { + fprintf(stderr, "get_pidfd_info: ioctl(PIDFD_GET_INFO) failed: %m\n"); + return false; + } + fprintf(stderr, "get_pidfd_info: mask=0x%llx, coredump_mask=0x%x, coredump_signal=%d\n", + (unsigned long long)info->mask, info->coredump_mask, info->coredump_signal); + return true; +} + +/* Protocol helper functions */ + +ssize_t recv_marker(int fd) +{ + enum coredump_mark mark = COREDUMP_MARK_REQACK; + ssize_t ret; + + ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL); + if (ret != sizeof(mark)) + return -1; + + switch (mark) { + case COREDUMP_MARK_REQACK: + fprintf(stderr, "Received marker: ReqAck\n"); + return COREDUMP_MARK_REQACK; + case COREDUMP_MARK_MINSIZE: + fprintf(stderr, "Received marker: MinSize\n"); + return COREDUMP_MARK_MINSIZE; + case COREDUMP_MARK_MAXSIZE: + fprintf(stderr, "Received marker: MaxSize\n"); + return COREDUMP_MARK_MAXSIZE; + case COREDUMP_MARK_UNSUPPORTED: + fprintf(stderr, "Received marker: Unsupported\n"); + return COREDUMP_MARK_UNSUPPORTED; + case COREDUMP_MARK_CONFLICTING: + fprintf(stderr, "Received marker: Conflicting\n"); + return COREDUMP_MARK_CONFLICTING; + default: + fprintf(stderr, "Received unknown marker: %u\n", mark); + break; + } + return -1; +} + +bool read_marker(int fd, enum coredump_mark mark) +{ + ssize_t ret; + + ret = recv_marker(fd); + if (ret < 0) + return false; + return ret == mark; +} + +bool read_coredump_req(int fd, struct coredump_req *req) +{ + ssize_t ret; + size_t field_size, user_size, ack_size, kernel_size, remaining_size; + + memset(req, 0, sizeof(*req)); + field_size = sizeof(req->size); + + /* Peek the size of the coredump request. */ + ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL); + if (ret != field_size) { + fprintf(stderr, "read_coredump_req: peek failed (got %zd, expected %zu): %m\n", + ret, field_size); + return false; + } + kernel_size = req->size; + + if (kernel_size < COREDUMP_ACK_SIZE_VER0) { + fprintf(stderr, "read_coredump_req: kernel_size %zu < min %d\n", + kernel_size, COREDUMP_ACK_SIZE_VER0); + return false; + } + if (kernel_size >= PAGE_SIZE) { + fprintf(stderr, "read_coredump_req: kernel_size %zu >= PAGE_SIZE %d\n", + kernel_size, PAGE_SIZE); + return false; + } + + /* Use the minimum of user and kernel size to read the full request. */ + user_size = sizeof(struct coredump_req); + ack_size = user_size < kernel_size ? user_size : kernel_size; + ret = recv(fd, req, ack_size, MSG_WAITALL); + if (ret != ack_size) + return false; + + fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n", + req->size, (unsigned long long)req->mask); + + if (user_size > kernel_size) + remaining_size = user_size - kernel_size; + else + remaining_size = kernel_size - user_size; + + if (PAGE_SIZE <= remaining_size) + return false; + + /* + * Discard any additional data if the kernel's request was larger than + * what we knew about or cared about. + */ + if (remaining_size) { + char buffer[PAGE_SIZE]; + + ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL); + if (ret != remaining_size) + return false; + fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size); + } + + return true; +} + +bool send_coredump_ack(int fd, const struct coredump_req *req, + __u64 mask, size_t size_ack) +{ + ssize_t ret; + /* + * Wrap struct coredump_ack in a larger struct so we can + * simulate sending to much data to the kernel. + */ + struct large_ack_for_size_testing { + struct coredump_ack ack; + char buffer[PAGE_SIZE]; + } large_ack = {}; + + if (!size_ack) + size_ack = sizeof(struct coredump_ack) < req->size_ack ? + sizeof(struct coredump_ack) : + req->size_ack; + large_ack.ack.mask = mask; + large_ack.ack.size = size_ack; + ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL); + if (ret != size_ack) + return false; + + fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n", + size_ack, (unsigned long long)mask); + return true; +} + +bool check_coredump_req(const struct coredump_req *req, size_t min_size, + __u64 required_mask) +{ + if (req->size < min_size) + return false; + if ((req->mask & required_mask) != required_mask) + return false; + if (req->mask & ~required_mask) + return false; + return true; +} + +int open_coredump_tmpfile(int fd_tmpfs_detached) +{ + return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); +} + +void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file) +{ + int epfd = -1; + int exit_code = EXIT_FAILURE; + struct epoll_event ev; + int flags; + + /* Set socket to non-blocking mode for edge-triggered epoll */ + flags = fcntl(fd_coredump, F_GETFL, 0); + if (flags < 0) { + fprintf(stderr, "Worker: fcntl(F_GETFL) failed: %m\n"); + goto out; + } + if (fcntl(fd_coredump, F_SETFL, flags | O_NONBLOCK) < 0) { + fprintf(stderr, "Worker: fcntl(F_SETFL, O_NONBLOCK) failed: %m\n"); + goto out; + } + + epfd = epoll_create1(0); + if (epfd < 0) { + fprintf(stderr, "Worker: epoll_create1() failed: %m\n"); + goto out; + } + + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; + ev.data.fd = fd_coredump; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) { + fprintf(stderr, "Worker: epoll_ctl(EPOLL_CTL_ADD) failed: %m\n"); + goto out; + } + + for (;;) { + struct epoll_event events[1]; + int n = epoll_wait(epfd, events, 1, -1); + if (n < 0) { + fprintf(stderr, "Worker: epoll_wait() failed: %m\n"); + break; + } + + if (events[0].events & (EPOLLIN | EPOLLRDHUP)) { + for (;;) { + char buffer[4096]; + ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; + fprintf(stderr, "Worker: read() failed: %m\n"); + goto out; + } + if (bytes_read == 0) + goto done; + ssize_t bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_write != bytes_read) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "Worker: write() failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + } + } + +done: + exit_code = EXIT_SUCCESS; + fprintf(stderr, "Worker: completed successfully\n"); +out: + if (epfd >= 0) + close(epfd); + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + _exit(exit_code); +} diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index a4ac80bb1003..c2e895bcc160 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -23,57 +23,15 @@ #include "../filesystems/wrappers.h" #include "../pidfd/pidfd.h" +#include "coredump_test.h" + #define STACKDUMP_FILE "stack_values" #define STACKDUMP_SCRIPT "stackdump" -#define NUM_THREAD_SPAWN 128 #ifndef PAGE_SIZE #define PAGE_SIZE 4096 #endif -static void *do_nothing(void *) -{ - while (1) - pause(); - - return NULL; -} - -static void crashing_child(void) -{ - pthread_t thread; - int i; - - for (i = 0; i < NUM_THREAD_SPAWN; ++i) - pthread_create(&thread, NULL, do_nothing, NULL); - - /* crash on purpose */ - i = *(int *)NULL; -} - -FIXTURE(coredump) -{ - char original_core_pattern[256]; - pid_t pid_coredump_server; - int fd_tmpfs_detached; -}; - -static int create_detached_tmpfs(void) -{ - int fd_context, fd_tmpfs; - - fd_context = sys_fsopen("tmpfs", 0); - if (fd_context < 0) - return -1; - - if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) - return -1; - - fd_tmpfs = sys_fsmount(fd_context, 0, 0); - close(fd_context); - return fd_tmpfs; -} - FIXTURE_SETUP(coredump) { FILE *file; @@ -208,1620 +166,4 @@ TEST_F_TIMEOUT(coredump, stackdump, 120) fclose(file); } -static int create_and_listen_unix_socket(const char *path) -{ - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - assert(strlen(path) < sizeof(addr.sun_path) - 1); - strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); - size_t addr_len = - offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1; - int fd, ret; - - fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (fd < 0) - goto out; - - ret = bind(fd, (const struct sockaddr *)&addr, addr_len); - if (ret < 0) - goto out; - - ret = listen(fd, 128); - if (ret < 0) - goto out; - - return fd; - -out: - if (fd >= 0) - close(fd); - return -1; -} - -static bool set_core_pattern(const char *pattern) -{ - int fd; - ssize_t ret; - - fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC); - if (fd < 0) - return false; - - ret = write(fd, pattern, strlen(pattern)); - close(fd); - if (ret < 0) - return false; - - fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern)); - return ret == strlen(pattern); -} - -static int get_peer_pidfd(int fd) -{ - int fd_peer_pidfd; - socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd); - int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, - &fd_peer_pidfd_len); - if (ret < 0) { - fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n"); - return -1; - } - return fd_peer_pidfd; -} - -static bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) -{ - memset(info, 0, sizeof(*info)); - info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - return ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) == 0; -} - -static void -wait_and_check_coredump_server(pid_t pid_coredump_server, - struct __test_metadata *const _metadata, - FIXTURE_DATA(coredump)* self) -{ - int status; - waitpid(pid_coredump_server, &status, 0); - self->pid_coredump_server = -ESRCH; - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); -} - -TEST_F(coredump, socket) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - fd_core_file = creat("/tmp/coredump.file", 0644); - if (fd_core_file < 0) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) - goto out; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); - ASSERT_GT(st.st_size, 0); - system("file /tmp/coredump.file"); -} - -TEST_F(coredump, socket_detect_userspace_client) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (info.coredump_mask & PIDFD_COREDUMPED) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) { - int fd_socket; - ssize_t ret; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len = - offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); - - fd_socket = socket(AF_UNIX, SOCK_STREAM, 0); - if (fd_socket < 0) - _exit(EXIT_FAILURE); - - ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) - _exit(EXIT_FAILURE); - - close(fd_socket); - _exit(EXIT_SUCCESS); - } - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_NE(stat("/tmp/coredump.file", &st), 0); - ASSERT_EQ(errno, ENOENT); -} - -TEST_F(coredump, socket_enoent) -{ - int pidfd, status; - pid_t pid; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); -} - -TEST_F(coredump, socket_no_listener) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - int ipc_sockets[2]; - char c; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (fd_server < 0) - goto out; - - ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_server >= 0) - close(fd_server); - close(ipc_sockets[1]); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -static ssize_t recv_marker(int fd) -{ - enum coredump_mark mark = COREDUMP_MARK_REQACK; - ssize_t ret; - - ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL); - if (ret != sizeof(mark)) - return -1; - - switch (mark) { - case COREDUMP_MARK_REQACK: - fprintf(stderr, "Received marker: ReqAck\n"); - return COREDUMP_MARK_REQACK; - case COREDUMP_MARK_MINSIZE: - fprintf(stderr, "Received marker: MinSize\n"); - return COREDUMP_MARK_MINSIZE; - case COREDUMP_MARK_MAXSIZE: - fprintf(stderr, "Received marker: MaxSize\n"); - return COREDUMP_MARK_MAXSIZE; - case COREDUMP_MARK_UNSUPPORTED: - fprintf(stderr, "Received marker: Unsupported\n"); - return COREDUMP_MARK_UNSUPPORTED; - case COREDUMP_MARK_CONFLICTING: - fprintf(stderr, "Received marker: Conflicting\n"); - return COREDUMP_MARK_CONFLICTING; - default: - fprintf(stderr, "Received unknown marker: %u\n", mark); - break; - } - return -1; -} - -static bool read_marker(int fd, enum coredump_mark mark) -{ - ssize_t ret; - - ret = recv_marker(fd); - if (ret < 0) - return false; - return ret == mark; -} - -static bool read_coredump_req(int fd, struct coredump_req *req) -{ - ssize_t ret; - size_t field_size, user_size, ack_size, kernel_size, remaining_size; - - memset(req, 0, sizeof(*req)); - field_size = sizeof(req->size); - - /* Peek the size of the coredump request. */ - ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL); - if (ret != field_size) - return false; - kernel_size = req->size; - - if (kernel_size < COREDUMP_ACK_SIZE_VER0) - return false; - if (kernel_size >= PAGE_SIZE) - return false; - - /* Use the minimum of user and kernel size to read the full request. */ - user_size = sizeof(struct coredump_req); - ack_size = user_size < kernel_size ? user_size : kernel_size; - ret = recv(fd, req, ack_size, MSG_WAITALL); - if (ret != ack_size) - return false; - - fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n", - req->size, (unsigned long long)req->mask); - - if (user_size > kernel_size) - remaining_size = user_size - kernel_size; - else - remaining_size = kernel_size - user_size; - - if (PAGE_SIZE <= remaining_size) - return false; - - /* - * Discard any additional data if the kernel's request was larger than - * what we knew about or cared about. - */ - if (remaining_size) { - char buffer[PAGE_SIZE]; - - ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL); - if (ret != remaining_size) - return false; - fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size); - } - - return true; -} - -static bool send_coredump_ack(int fd, const struct coredump_req *req, - __u64 mask, size_t size_ack) -{ - ssize_t ret; - /* - * Wrap struct coredump_ack in a larger struct so we can - * simulate sending to much data to the kernel. - */ - struct large_ack_for_size_testing { - struct coredump_ack ack; - char buffer[PAGE_SIZE]; - } large_ack = {}; - - if (!size_ack) - size_ack = sizeof(struct coredump_ack) < req->size_ack ? - sizeof(struct coredump_ack) : - req->size_ack; - large_ack.ack.mask = mask; - large_ack.ack.size = size_ack; - ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL); - if (ret != size_ack) - return false; - - fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n", - size_ack, (unsigned long long)mask); - return true; -} - -static bool check_coredump_req(const struct coredump_req *req, size_t min_size, - __u64 required_mask) -{ - if (req->size < min_size) - return false; - if ((req->mask & required_mask) != required_mask) - return false; - if (req->mask & ~required_mask) - return false; - return true; -} - -TEST_F(coredump, socket_request_kernel) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - fd_core_file = creat("/tmp/coredump.file", 0644); - if (fd_core_file < 0) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) - goto out; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); - ASSERT_GT(st.st_size, 0); - system("file /tmp/coredump.file"); -} - -TEST_F(coredump, socket_request_userspace) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read > 0) - goto out; - - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_reject) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read > 0) - goto out; - - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_flag_combination) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_unknown_flag) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_size_small) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, - COREDUMP_ACK_SIZE_VER0 / 2)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_size_large) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, - COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -static int open_coredump_tmpfile(int fd_tmpfs_detached) -{ - return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); -} - -#define NUM_CRASHING_COREDUMPS 5 - -TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500) -{ - int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; - pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - int exit_code = EXIT_FAILURE; - struct coredump_req req = {}; - - close(ipc_sockets[0]); - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) { - fprintf(stderr, "Failed to create and listen on unix socket\n"); - goto out; - } - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) { - fprintf(stderr, "Failed to notify parent via ipc socket\n"); - goto out; - } - close(ipc_sockets[1]); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) { - fprintf(stderr, "accept4 failed: %m\n"); - goto out; - } - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) { - fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump); - goto out; - } - - if (!get_pidfd_info(fd_peer_pidfd, &info)) { - fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd); - goto out; - } - - if (!(info.mask & PIDFD_INFO_COREDUMP)) { - fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd); - goto out; - } - if (!(info.coredump_mask & PIDFD_COREDUMPED)) { - fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd); - goto out; - } - - if (!read_coredump_req(fd_coredump, &req)) { - fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump); - goto out; - } - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) { - fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump); - goto out; - } - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { - fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump); - goto out; - } - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { - fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump); - goto out; - } - - fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); - if (fd_core_file < 0) { - fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump); - goto out; - } - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) { - fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump); - goto out; - } - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) { - fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file); - goto out; - } - } - - close(fd_core_file); - close(fd_peer_pidfd); - close(fd_coredump); - fd_peer_pidfd = -1; - fd_coredump = -1; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - pid[i] = fork(); - ASSERT_GE(pid[i], 0); - if (pid[i] == 0) - crashing_child(); - pidfd[i] = sys_pidfd_open(pid[i], 0); - ASSERT_GE(pidfd[i], 0); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - waitpid(pid[i], &status[i], 0); - ASSERT_TRUE(WIFSIGNALED(status[i])); - ASSERT_TRUE(WCOREDUMP(status[i])); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - } - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -#define MAX_EVENTS 128 - -static void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file) -{ - int epfd = -1; - int exit_code = EXIT_FAILURE; - - epfd = epoll_create1(0); - if (epfd < 0) - goto out; - - struct epoll_event ev; - ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; - ev.data.fd = fd_coredump; - if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) - goto out; - - for (;;) { - struct epoll_event events[1]; - int n = epoll_wait(epfd, events, 1, -1); - if (n < 0) - break; - - if (events[0].events & (EPOLLIN | EPOLLRDHUP)) { - for (;;) { - char buffer[4096]; - ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - break; - goto out; - } - if (bytes_read == 0) - goto done; - ssize_t bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_write != bytes_read) - goto out; - } - } - } - -done: - exit_code = EXIT_SUCCESS; -out: - if (epfd >= 0) - close(epfd); - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - _exit(exit_code); -} - -TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500) -{ - int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; - pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS]; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0; - fd_server = -1; - exit_code = EXIT_FAILURE; - n_conns = 0; - close(ipc_sockets[0]); - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - close(ipc_sockets[1]); - - while (n_conns < NUM_CRASHING_COREDUMPS) { - int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - struct coredump_req req = {}; - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - continue; - goto out; - } - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - if (!read_coredump_req(fd_coredump, &req)) - goto out; - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) - goto out; - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); - if (fd_core_file < 0) - goto out; - pid_t worker = fork(); - if (worker == 0) { - close(fd_server); - process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file); - } - worker_pids[n_conns] = worker; - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_core_file >= 0) - close(fd_core_file); - n_conns++; - } - exit_code = EXIT_SUCCESS; -out: - if (fd_server >= 0) - close(fd_server); - - // Reap all worker processes - for (int i = 0; i < n_conns; i++) { - int wstatus; - if (waitpid(worker_pids[i], &wstatus, 0) < 0) { - fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]); - } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) { - fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus)); - exit_code = EXIT_FAILURE; - } - } - - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - pid[i] = fork(); - ASSERT_GE(pid[i], 0); - if (pid[i] == 0) - crashing_child(); - pidfd[i] = sys_pidfd_open(pid[i], 0); - ASSERT_GE(pidfd[i], 0); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - ASSERT_GE(waitpid(pid[i], &status[i], 0), 0); - ASSERT_TRUE(WIFSIGNALED(status[i])); - ASSERT_TRUE(WCOREDUMP(status[i])); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - } - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_invalid_paths) -{ - ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket")); - ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/..")); - ASSERT_FALSE(set_core_pattern("@..")); - - ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/..")); - ASSERT_FALSE(set_core_pattern("@@..")); - - ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket")); -} - TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 6e41635bd55a..71ee69e524d7 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -18,6 +18,7 @@ TEST_PROGS := \ netcons_fragmented_msg.sh \ netcons_overflow.sh \ netcons_sysdata.sh \ + netcons_torture.sh \ netpoll_basic.py \ ping.py \ psp.py \ diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 402d4ee84f2e..6c5c60adb5e8 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -14,6 +14,7 @@ TEST_PROGS := \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ mode-2-recovery-updelay.sh \ + netcons_over_bonding.sh \ # end of TEST_PROGS TEST_FILES := \ @@ -24,6 +25,7 @@ TEST_FILES := \ TEST_INCLUDES := \ ../../../net/lib.sh \ + ../lib/sh/lib_netcons.sh \ ../../../net/forwarding/lib.sh \ # end of TEST_INCLUDES diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 6bb290abd48b..991494376223 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -1,5 +1,6 @@ CONFIG_BONDING=y CONFIG_BRIDGE=y +CONFIG_CONFIGFS_FS=y CONFIG_DUMMY=y CONFIG_INET_ESP=y CONFIG_INET_ESP_OFFLOAD=y @@ -9,6 +10,9 @@ CONFIG_MACVLAN=y CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y CONFIG_NET_CLS_MATCHALL=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y CONFIG_NETDEVSIM=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y diff --git a/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh new file mode 100755 index 000000000000..477cc9379500 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh @@ -0,0 +1,361 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 +# +# This selftest exercises trying to have multiple netpoll users at the same +# time. +# +# This selftest has multiple smalls test inside, and the goal is to +# get interfaces with bonding and netconsole in different orders in order +# to catch any possible issue. +# +# The main test composes of four interfaces being created using netdevsim; two +# of them are bonded to serve as the netconsole's transmit interface. The +# remaining two interfaces are similarly bonded and assigned to a separate +# network namespace, which acts as the receive interface, where socat monitors +# for incoming messages. +# +# A netconsole message is then sent to ensure it is properly received across +# this configuration. +# +# Later, run a few other tests, to make sure that bonding and netconsole +# cannot coexist. +# +# The test's objective is to exercise netpoll usage when managed simultaneously +# by multiple subsystems (netconsole and bonding). +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true +modprobe bonding 2> /dev/null || true +modprobe veth 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup_bond EXIT + +FORMAT="extended" +IP_VERSION="ipv4" +VETH0="veth"$(( RANDOM % 256)) +VETH1="veth"$((256 + RANDOM % 256)) +TXNS="" +RXNS="" + +# Create "bond_tx_XX" and "bond_rx_XX" interfaces, and set DSTIF and SRCIF with +# the bonding interfaces +function setup_bonding_ifaces() { + local RAND=$(( RANDOM % 100 )) + BOND_TX_MAIN_IF="bond_tx_$RAND" + BOND_RX_MAIN_IF="bond_rx_$RAND" + + # Setup TX + if ! ip -n "${TXNS}" link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + then + echo "Failed to create bond TX interface. Is CONFIG_BONDING set?" >&2 + # only clean nsim ifaces and namespace. Nothing else has been + # initialized + cleanup_bond_nsim + trap - EXIT + exit "${ksft_skip}" + fi + + # create_netdevsim() got the interface up, but it needs to be down + # before being enslaved. + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # Setup RX + ip -n "${RXNS}" \ + link add "${BOND_RX_MAIN_IF}" type bond mode balance-rr + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX_MAIN_IF}" up + + export DSTIF="${BOND_RX_MAIN_IF}" + export SRCIF="${BOND_TX_MAIN_IF}" +} + +# Create 4 netdevsim interfaces. Two of them will be bound to TX bonding iface +# and the other two will be bond to the RX interface (on the other namespace) +function create_ifaces_bond() { + BOND_TX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_1}" "${TXNS}") + BOND_TX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_2}" "${TXNS}") + BOND_RX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_1}" "${RXNS}") + BOND_RX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_2}" "${RXNS}") +} + +# netdevsim link BOND_TX to BOND_RX interfaces +function link_ifaces_bond() { + local BOND_TX1_SLAVE_IFIDX + local BOND_TX2_SLAVE_IFIDX + local BOND_RX1_SLAVE_IFIDX + local BOND_RX2_SLAVE_IFIDX + local TXNS_FD + local RXNS_FD + + BOND_TX1_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX1_SLAVE_IF"/ifindex) + BOND_TX2_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX2_SLAVE_IF"/ifindex) + BOND_RX1_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX1_SLAVE_IF"/ifindex) + BOND_RX2_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX2_SLAVE_IF"/ifindex) + + exec {TXNS_FD}</var/run/netns/"${TXNS}" + exec {RXNS_FD}</var/run/netns/"${RXNS}" + + # Linking TX ifaces to the RX ones (on the other namespace) + echo "${TXNS_FD}:$BOND_TX1_SLAVE_IFIDX $RXNS_FD:$BOND_RX1_SLAVE_IFIDX" \ + > "$NSIM_DEV_SYS_LINK" + echo "${TXNS_FD}:$BOND_TX2_SLAVE_IFIDX $RXNS_FD:$BOND_RX2_SLAVE_IFIDX" \ + > "$NSIM_DEV_SYS_LINK" + + exec {TXNS_FD}<&- + exec {RXNS_FD}<&- +} + +function create_all_ifaces() { + # setup_ns function is coming from lib.sh + setup_ns TXNS RXNS + export NAMESPACE="${RXNS}" + + # Create two interfaces for RX and two for TX + create_ifaces_bond + # Link netlink ifaces + link_ifaces_bond +} + +# configure DSTIF and SRCIF IPs +function configure_ifaces_ips() { + local IP_VERSION=${1:-"ipv4"} + select_ipv4_or_ipv6 "${IP_VERSION}" + + ip -n "${RXNS}" addr add "${DSTIP}"/24 dev "${DSTIF}" + ip -n "${RXNS}" link set "${DSTIF}" up + + ip -n "${TXNS}" addr add "${SRCIP}"/24 dev "${SRCIF}" + ip -n "${TXNS}" link set "${SRCIF}" up +} + +function test_enable_netpoll_on_enslaved_iface() { + echo 0 > "${NETCONS_PATH}"/enabled + + # At this stage, BOND_TX1_SLAVE_IF is enslaved to BOND_TX_MAIN_IF, and + # linked to BOND_RX1_SLAVE_IF inside the namespace. + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # This should fail with the following message in dmesg: + # netpoll: netconsole: ethX is a slave device, aborting + set +e + enable_netcons_ns 2> /dev/null + set -e + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Bonding and netpoll cannot co-exists." >&2 + exit "${ksft_fail}" + fi +} + +function test_delete_bond_and_reenable_target() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond + + # BOND_TX1_SLAVE_IF is not attached to a bond interface anymore + # netpoll can be plugged in there + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # this should work, since the interface is not enslaved + enable_netcons_ns + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Unable to start netpoll on an unbond iface." >&2 + exit "${ksft_fail}" + fi +} + +# Send a netconsole message to the netconsole target +function test_send_netcons_msg_through_bond_iface() { + # Listen for netconsole port inside the namespace and + # destination interface + listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" & + # Wait for socat to start and listen to the port. + wait_for_port "${RXNS}" "${PORT}" "${IP_VERSION}" + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_result "${OUTPUT_FILE}" "${FORMAT}" + # kill socat in case it is still running + pkill_socat +} + +# BOND_TX1_SLAVE_IF has netconsole enabled on it, bind it to BOND_TX_MAIN_IF. +# Given BOND_TX_MAIN_IF was deleted, recreate it first +function test_enslave_netcons_enabled_iface { + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: netconsole expected to be enabled against BOND_TX1_SLAVE_IF" >&2 + exit "${ksft_fail}" + fi + + # recreate the bonding iface. it got deleted by previous + # test (test_delete_bond_and_reenable_target) + ip -n "${TXNS}" \ + link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + + # sub-interface need to be down before attaching to bonding + # This will also disable netconsole. + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Device is part of a bond iface, cannot have netcons enabled" >&2 + exit "${ksft_fail}" + fi +} + +# Get netconsole enabled on a bonding interface and attach a second +# sub-interface. +function test_enslave_iface_to_bond { + # BOND_TX_MAIN_IF has only BOND_TX1_SLAVE_IF right now + echo "${BOND_TX_MAIN_IF}" > "${NETCONS_PATH}"/dev_name + enable_netcons_ns + + # netcons is attached to bond0 and BOND_TX1_SLAVE_IF is + # part of BOND_TX_MAIN_IF. Attach BOND_TX2_SLAVE_IF to BOND_TX_MAIN_IF. + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Netconsole should be enabled on bonding interface. Failed" >&2 + exit "${ksft_fail}" + fi +} + +function test_enslave_iff_disabled_netpoll_iface { + local ret + + # Create two interfaces. veth interfaces it known to have + # IFF_DISABLE_NETPOLL set + if ! ip link add "${VETH0}" type veth peer name "${VETH1}" + then + echo "Failed to create veth TX interface. Is CONFIG_VETH set?" >&2 + exit "${ksft_skip}" + fi + set +e + # This will print RTNETLINK answers: Device or resource busy + ip link set "${VETH0}" master "${BOND_TX_MAIN_IF}" 2> /dev/null + ret=$? + set -e + if [[ $ret -eq 0 ]] + then + echo "test failed: veth interface could not be enslaved" + exit "${ksft_fail}" + fi +} + +# Given that netconsole picks the current net namespace, we need to enable it +# from inside the TXNS namespace +function enable_netcons_ns() { + ip netns exec "${TXNS}" sh -c \ + "mount -t configfs configfs /sys/kernel/config && echo 1 > $NETCONS_PATH/enabled" +} + +#################### +# Tests start here # +#################### + +# Create regular interfaces using netdevsim and link them +create_all_ifaces + +# Setup the bonding interfaces +# BOND_RX_MAIN_IF has BOND_RX{1,2}_SLAVE_IF +# BOND_TX_MAIN_IF has BOND_TX{1,2}_SLAVE_IF +setup_bonding_ifaces + +# Configure the ips as BOND_RX1_SLAVE_IF and BOND_TX1_SLAVE_IF +configure_ifaces_ips "${IP_VERSION}" + +_create_dynamic_target "${FORMAT}" "${NETCONS_PATH}" +enable_netcons_ns +set_user_data + +# Test #1 : Create an bonding interface and attach netpoll into +# the bonding interface. Netconsole/netpoll should work on +# the bonding interface. +test_send_netcons_msg_through_bond_iface +echo "test #1: netpoll on bonding interface worked. Test passed" >&2 + +# Test #2: Attach netpoll to an enslaved interface +# Try to attach netpoll to an enslaved sub-interface (while still being part of +# a bonding interface), which shouldn't be allowed +test_enable_netpoll_on_enslaved_iface +echo "test #2: netpoll correctly rejected enslaved interface (expected behavior). Test passed." >&2 + +# Test #3: Unplug the sub-interface from bond and enable netconsole +# Detach the interface from a bonding interface and attach netpoll again +test_delete_bond_and_reenable_target +echo "test #3: Able to attach to an unbound interface. Test passed." >&2 + +# Test #4: Enslave a sub-interface that had netconsole enabled +# Try to enslave an interface that has netconsole/netpoll enabled. +# Previous test has netconsole enabled in BOND_TX1_SLAVE_IF, try to enslave it +test_enslave_netcons_enabled_iface +echo "test #4: Enslaving an interface with netpoll attached. Test passed." >&2 + +# Test #5: Enslave a sub-interface to a bonding interface +# Enslave an interface to a bond interface that has netpoll attached +# At this stage, BOND_TX_MAIN_IF is created and BOND_TX1_SLAVE_IF is part of +# it. Netconsole is currently disabled +test_enslave_iface_to_bond +echo "test #5: Enslaving an interface to bond+netpoll. Test passed." >&2 + +# Test #6: Enslave a IFF_DISABLE_NETPOLL sub-interface to a bonding interface +# At this stage, BOND_TX_MAIN_IF has both sub interface and netconsole is +# enabled. This test will try to enslave an a veth (IFF_DISABLE_NETPOLL) interface +# and it should fail, with netpoll: veth0 doesn't support polling +test_enslave_iff_disabled_netpoll_iface +echo "test #6: Enslaving IFF_DISABLE_NETPOLL ifaces to bond iface is not supported. Test passed." >&2 + +cleanup_bond +trap - EXIT +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 8e1085e89647..87f89fd92f8c 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -11,9 +11,11 @@ set -euo pipefail LIBDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") SRCIF="" # to be populated later +SRCIP="" # to be populated later SRCIP4="192.0.2.1" SRCIP6="fc00::1" DSTIF="" # to be populated later +DSTIP="" # to be populated later DSTIP4="192.0.2.2" DSTIP6="fc00::2" @@ -28,17 +30,23 @@ NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" # NAMESPACE will be populated by setup_ns with a random value NAMESPACE="" -# IDs for netdevsim +# IDs for netdevsim. We either use NSIM_DEV_{1,2}_ID for standard test +# or NSIM_BOND_{T,R}X_{1,2} for the bonding tests. Not both at the +# same time. NSIM_DEV_1_ID=$((256 + RANDOM % 256)) NSIM_DEV_2_ID=$((512 + RANDOM % 256)) +NSIM_BOND_TX_1=$((768 + RANDOM % 256)) +NSIM_BOND_TX_2=$((1024 + RANDOM % 256)) +NSIM_BOND_RX_1=$((1280 + RANDOM % 256)) +NSIM_BOND_RX_2=$((1536 + RANDOM % 256)) NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device" +NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" # Used to create and delete namespaces source "${LIBDIR}"/../../../../net/lib.sh # Create netdevsim interfaces create_ifaces() { - echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" udevadm settle 2> /dev/null || true @@ -113,31 +121,38 @@ function set_network() { configure_ip } -function create_dynamic_target() { - local FORMAT=${1:-"extended"} +function _create_dynamic_target() { + local FORMAT="${1:?FORMAT parameter required}" + local NCPATH="${2:?NCPATH parameter required}" DSTMAC=$(ip netns exec "${NAMESPACE}" \ ip link show "${DSTIF}" | awk '/ether/ {print $2}') # Create a dynamic target - mkdir "${NETCONS_PATH}" + mkdir "${NCPATH}" - echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip - echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip - echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac - echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name + echo "${DSTIP}" > "${NCPATH}"/remote_ip + echo "${SRCIP}" > "${NCPATH}"/local_ip + echo "${DSTMAC}" > "${NCPATH}"/remote_mac + echo "${SRCIF}" > "${NCPATH}"/dev_name if [ "${FORMAT}" == "basic" ] then # Basic target does not support release - echo 0 > "${NETCONS_PATH}"/release - echo 0 > "${NETCONS_PATH}"/extended + echo 0 > "${NCPATH}"/release + echo 0 > "${NCPATH}"/extended elif [ "${FORMAT}" == "extended" ] then - echo 1 > "${NETCONS_PATH}"/extended + echo 1 > "${NCPATH}"/extended fi +} - echo 1 > "${NETCONS_PATH}"/enabled +function create_dynamic_target() { + local FORMAT=${1:-"extended"} + local NCPATH=${2:-"$NETCONS_PATH"} + _create_dynamic_target "${FORMAT}" "${NCPATH}" + + echo 1 > "${NCPATH}"/enabled # This will make sure that the kernel was able to # load the netconsole driver configuration. The console message @@ -185,14 +200,26 @@ function do_cleanup() { echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk } -function cleanup() { +function cleanup_netcons() { # delete netconsole dynamic reconfiguration - echo 0 > "${NETCONS_PATH}"/enabled + # do not fail if the target is already disabled + if [[ ! -d "${NETCONS_PATH}" ]] + then + # in some cases this is called before netcons path is created + return + fi + if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]] + then + echo 0 > "${NETCONS_PATH}"/enabled || true + fi # Remove all the keys that got created during the selftest find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete # Remove the configfs entry rmdir "${NETCONS_PATH}" +} +function cleanup() { + cleanup_netcons do_cleanup } @@ -369,3 +396,24 @@ function wait_for_port() { # more frequently on IPv6 sleep 1 } + +# Clean up netdevsim ifaces created for bonding test +function cleanup_bond_nsim() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond || true + ip -n "${RXNS}" \ + link delete "${BOND_RX_MAIN_IF}" type bond || true + + cleanup_netdevsim "$NSIM_BOND_TX_1" + cleanup_netdevsim "$NSIM_BOND_TX_2" + cleanup_netdevsim "$NSIM_BOND_RX_1" + cleanup_netdevsim "$NSIM_BOND_RX_2" +} + +# cleanup tests that use bonding interfaces +function cleanup_bond() { + cleanup_netcons + cleanup_bond_nsim + cleanup_all_ns + ip link delete "${VETH0}" || true +} diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh new file mode 100755 index 000000000000..2ce9ee3719d1 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_torture.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Repeatedly send kernel messages, toggles netconsole targets on and off, +# creates and deletes targets in parallel, and toggles the source interface to +# simulate stress conditions. +# +# This test aims to verify the robustness of netconsole under dynamic +# configurations and concurrent operations. +# +# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make +# sure no issues is reported. +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +# Number of times the main loop run +ITERATIONS=${1:-150} + +# Only test extended format +FORMAT="extended" +# And ipv6 only +IP_VERSION="ipv6" + +# Create, enable and delete some targets. +create_and_delete_random_target() { + COUNT=2 + RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_) + + if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \ + [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then + echo "Function didn't finish yet, skipping it." >&2 + return + fi + + # enable COUNT targets + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + + # Basic population so the target can come up + _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}" + done + + echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg + # disable them all + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]] + then + echo 0 > "${RND_TARGET_PATH}"/enabled + fi + rmdir "${RND_TARGET_PATH}" + done +} + +# Disable and enable the target mid-air, while messages +# are being transmitted. +toggle_netcons_target() { + for i in $(seq 2) + do + if [ ! -d "${NETCONS_PATH}" ] + then + break + fi + echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + # Try to enable a bit harder, given it might fail to enable + # Write to `enabled` might fail depending on the lock, which is + # highly contentious here + for _ in $(seq 5) + do + echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + done + done +} + +toggle_iface(){ + ip link set "${SRCIF}" down + ip link set "${SRCIF}" up +} + +# Start here + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network "${IP_VERSION}" +# Create a dynamic target for netconsole +create_dynamic_target "${FORMAT}" + +for i in $(seq "$ITERATIONS") +do + for _ in $(seq 10) + do + echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg + done + wait + + if (( i % 30 == 0 )); then + toggle_netcons_target & + fi + + if (( i % 50 == 0 )); then + # create some targets, enable them, send msg and disable + # all in a parallel thread + create_and_delete_random_target & + fi + + if (( i % 70 == 0 )); then + toggle_iface & + fi +done +wait + +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile index daf51113c827..df10c7243511 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/Makefile +++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile @@ -20,4 +20,8 @@ TEST_PROGS := \ udp_tunnel_nic.sh \ # end of TEST_PROGS +TEST_FILES := \ + ethtool-common.sh +# end of TEST_FILES + include ../../../lib.mk diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index c43a69dffd83..a0c64f415a7f 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -487,7 +487,7 @@ int setup_userns(void) uid_t uid = getuid(); gid_t gid = getgid(); - ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER); if (ret) { ksft_exit_fail_msg("unsharing mountns and userns: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc index c62165fabd0c..cfa16aa1f39a 100644 --- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc +++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc @@ -20,6 +20,10 @@ sample_events() { echo 0 > tracing_on echo 0 > events/enable +# Clear functions caused by page cache; run sample_events twice +sample_events +sample_events + echo "Get the most frequently calling function" echo > trace sample_events diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 3eebf5e3b974..bb4d33dde3c8 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2638,6 +2638,8 @@ TEST_F(vfio_compat_mock_domain, map) ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size); + /* Unmap of empty is success */ + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); /* UNMAP_FLAG_ALL requires 0 iova/size */ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 772ca1db6e59..9f472c20c190 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -1044,8 +1044,8 @@ static int _test_cmd_trigger_vevents(int fd, __u32 dev_id, __u32 nvevents) }; while (nvevents--) { - if (!ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT), - &trigger_vevent_cmd)) + if (ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT), + &trigger_vevent_cmd)) return -1; } return 0; diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index c9b84eeaab6b..0a3a94c4cca1 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -63,11 +63,13 @@ static struct feature_id_reg feat_id_regs[] = { REG_FEAT(HDFGWTR2_EL2, ID_AA64MMFR0_EL1, FGT, FGT2), REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP), REG_FEAT(SCTLR2_EL1, ID_AA64MMFR3_EL1, SCTLRX, IMP), + REG_FEAT(SCTLR2_EL2, ID_AA64MMFR3_EL1, SCTLRX, IMP), REG_FEAT(VDISR_EL2, ID_AA64PFR0_EL1, RAS, IMP), REG_FEAT(VSESR_EL2, ID_AA64PFR0_EL1, RAS, IMP), REG_FEAT(VNCR_EL2, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY), REG_FEAT(CNTHV_CTL_EL2, ID_AA64MMFR1_EL1, VH, IMP), REG_FEAT(CNTHV_CVAL_EL2,ID_AA64MMFR1_EL1, VH, IMP), + REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP), }; bool filter_reg(__u64 reg) @@ -718,6 +720,7 @@ static __u64 el2_regs[] = { SYS_REG(VMPIDR_EL2), SYS_REG(SCTLR_EL2), SYS_REG(ACTLR_EL2), + SYS_REG(SCTLR2_EL2), SYS_REG(HCR_EL2), SYS_REG(MDCR_EL2), SYS_REG(CPTR_EL2), diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 5e24f77868b5..c4815d365816 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -268,7 +268,9 @@ static void guest_code(void) /* Return a safe value to a given ftr_bits an ftr value */ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) { - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift; + + TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features"); if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { @@ -320,7 +322,9 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) /* Return an invalid value to a given ftr_bits an ftr value */ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) { - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift; + + TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features"); if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { @@ -672,7 +676,7 @@ static void test_clidr(struct kvm_vcpu *vcpu) clidr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1)); /* find the first empty level in the cache hierarchy */ - for (level = 1; level < 7; level++) { + for (level = 1; level <= 7; level++) { if (!CLIDR_CTYPE(clidr, level)) break; } diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 09f270545646..0e2f8ed90f30 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -15,6 +15,8 @@ #include "gic_v3.h" #include "processor.h" +#define GITS_COLLECTION_TARGET_SHIFT 16 + static u64 its_read_u64(unsigned long offset) { return readq_relaxed(GITS_BASE_GVA + offset); @@ -163,6 +165,11 @@ static void its_encode_collection(struct its_cmd_block *cmd, u16 col) its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); } +static u64 procnum_to_rdbase(u32 vcpu_id) +{ + return vcpu_id << GITS_COLLECTION_TARGET_SHIFT; +} + #define GITS_CMDQ_POLL_ITERATIONS 0 static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) @@ -217,7 +224,7 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val its_encode_cmd(&cmd, GITS_CMD_MAPC); its_encode_collection(&cmd, collection_id); - its_encode_target(&cmd, vcpu_id); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); its_encode_valid(&cmd, valid); its_send_cmd(cmdq_base, &cmd); diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 9e3be2ee7f1b..f917b4c4c943 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -1758,10 +1758,15 @@ int main(int argc, char *argv[]) uffd_test_ops = mem_type->mem_ops; uffd_test_case_ops = test->test_case_ops; - if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) + if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) { gopts.page_size = default_huge_page_size(); - else + if (gopts.page_size == 0) { + uffd_test_skip("huge page size is 0, feature missing?"); + continue; + } + } else { gopts.page_size = psize(); + } /* Ensure we have at least 2 pages */ gopts.nr_pages = MAX(UFFD_TEST_MEM_SIZE, gopts.page_size * 2) @@ -1776,12 +1781,6 @@ int main(int argc, char *argv[]) continue; uffd_test_start("%s on %s", test->name, mem_type->name); - if ((mem_type->mem_flag == MEM_HUGETLB || - mem_type->mem_flag == MEM_HUGETLB_PRIVATE) && - (default_huge_page_size() == 0)) { - uffd_test_skip("huge page size is 0, feature missing?"); - continue; - } if (!uffd_feature_supported(test)) { uffd_test_skip("feature missing"); continue; diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index ccfb40837a73..0989e80da457 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -1,3 +1,12 @@ nsid_test file_handle_test init_ino_test +ns_active_ref_test +listns_test +listns_permissions_test +listns_efault_test +siocgskns_test +cred_change_test +stress_test +listns_pagination_bug +regression_pidfd_setns_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 5fe4b3dc07d3..fbb821652c17 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -1,7 +1,29 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +LDLIBS += -lcap -TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test +TEST_GEN_PROGS := nsid_test \ + file_handle_test \ + init_ino_test \ + ns_active_ref_test \ + listns_test \ + listns_permissions_test \ + listns_efault_test \ + siocgskns_test \ + cred_change_test \ + stress_test \ + listns_pagination_bug \ + regression_pidfd_setns_test include ../lib.mk +$(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c +$(OUTPUT)/listns_test: ../filesystems/utils.c +$(OUTPUT)/listns_permissions_test: ../filesystems/utils.c +$(OUTPUT)/listns_efault_test: ../filesystems/utils.c +$(OUTPUT)/siocgskns_test: ../filesystems/utils.c +$(OUTPUT)/cred_change_test: ../filesystems/utils.c +$(OUTPUT)/stress_test: ../filesystems/utils.c +$(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c +$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c + diff --git a/tools/testing/selftests/namespaces/cred_change_test.c b/tools/testing/selftests/namespaces/cred_change_test.c new file mode 100644 index 000000000000..7b4f5ad3f725 --- /dev/null +++ b/tools/testing/selftests/namespaces/cred_change_test.c @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/capability.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test credential changes and their impact on namespace active references. + */ + +/* + * Test setuid() in a user namespace properly swaps active references. + * Create a user namespace with multiple UIDs mapped, then setuid() between them. + * Verify that the user namespace remains active throughout. + */ +TEST(setuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setuid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple UIDs mapped (0-9) */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send namespace ID to parent */ + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform multiple setuid() calls. + * Each setuid() triggers commit_creds() which should properly + * swap active references via switch_cred_namespaces(). + */ + for (setuid_count = 0; setuid_count < 50; setuid_count++) { + uid_t target_uid = (setuid_count % 10); + if (setuid(target_uid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id); + + /* Verify namespace is active while child is running */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + ASSERT_TRUE(found); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive after child exits */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setuid() correctly preserved active references (no leak)"); +} + +/* + * Test setgid() in a user namespace properly handles active references. + */ +TEST(setgid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setgid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple GIDs mapped */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setgid() calls */ + for (setgid_count = 0; setgid_count < 50; setgid_count++) { + gid_t target_gid = (setgid_count % 10); + if (setgid(target_gid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setgid() correctly preserved active references (no leak)"); +} + +/* + * Test setresuid() which changes real, effective, and saved UIDs. + * This should properly swap active references via commit_creds(). + */ +TEST(setresuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setres_count; + + close(pipefd[0]); + + /* Create new user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setresuid() calls */ + for (setres_count = 0; setres_count < 30; setres_count++) { + uid_t uid1 = (setres_count % 5); + uid_t uid2 = ((setres_count + 1) % 5); + uid_t uid3 = ((setres_count + 2) % 5); + + if (setresuid(uid1, uid2, uid3) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setresuid() correctly preserved active references (no leak)"); +} + +/* + * Test credential changes across multiple user namespaces. + * Create nested user namespaces and verify active reference tracking. + */ +TEST(cred_change_nested_userns) +{ + pid_t pid; + int status; + __u64 parent_userns_id, child_userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found_parent = false, found_child = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 parent_id, child_id; + uid_t orig_uid = getuid(); + + close(pipefd[0]); + + /* Create first user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get first namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create nested user namespace */ + userns_fd = get_userns_fd(0, 0, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get nested namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + write(pipefd[1], &parent_id, sizeof(parent_id)); + write(pipefd[1], &child_id, sizeof(child_id)); + + /* Perform some credential changes in nested namespace */ + setuid(0); + setgid(0); + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read both namespace IDs */ + if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get parent namespace ID"); + } + + if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get child namespace ID"); + } + close(pipefd[0]); + + TH_LOG("Parent userns: %llu, Child userns: %llu", + (unsigned long long)parent_userns_id, + (unsigned long long)child_userns_id); + + /* Verify both namespaces are active */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_TRUE(found_parent); + ASSERT_TRUE(found_child); + + /* Wait for child */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify both namespaces become inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_parent = false; + found_child = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_FALSE(found_parent); + ASSERT_FALSE(found_child); + TH_LOG("Nested user namespace credential changes preserved active refs (no leak)"); +} + +/* + * Test rapid credential changes don't cause refcount imbalances. + * This stress-tests the switch_cred_namespaces() logic. + */ +TEST(rapid_cred_changes_no_leak) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int change_count; + + close(pipefd[0]); + + /* Create new user namespace with wider range of UIDs/GIDs */ + userns_fd = get_userns_fd(0, orig_uid, 100); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform many rapid credential changes. + * Mix setuid, setgid, setreuid, setregid, setresuid, setresgid. + */ + for (change_count = 0; change_count < 200; change_count++) { + switch (change_count % 6) { + case 0: + setuid(change_count % 50); + break; + case 1: + setgid(change_count % 50); + break; + case 2: + setreuid(change_count % 50, (change_count + 1) % 50); + break; + case 3: + setregid(change_count % 50, (change_count + 1) % 50); + break; + case 4: + setresuid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + case 5: + setresgid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive (no leaked active refs) */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("200 rapid credential changes completed with no active ref leak"); +} + +/* + * Test setfsuid/setfsgid which change filesystem UID/GID. + * These also trigger credential changes but may have different code paths. + */ +TEST(setfsuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int change_count; + + close(pipefd[0]); + + /* Create new user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setfsuid/setfsgid calls */ + for (change_count = 0; change_count < 50; change_count++) { + setfsuid(change_count % 10); + setfsgid(change_count % 10); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)"); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c new file mode 100644 index 000000000000..c7ed4023d7a8 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_efault_test.c @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "../pidfd/pidfd.h" +#include "wrappers.h" + +/* + * Test listns() error handling with invalid buffer addresses. + * + * When the buffer pointer is invalid (e.g., crossing page boundaries + * into unmapped memory), listns() returns EINVAL. + * + * This test also creates mount namespaces that get destroyed during + * iteration, testing that namespace cleanup happens outside the RCU + * read lock. + */ +TEST(listns_partial_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[5]; + int sv[5][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* + * Map two pages: + * - First page: readable and writable + * - Second page: will be unmapped to trigger EFAULT + */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position the buffer pointer so there's room for exactly one u64 + * before the page boundary. The second u64 would fall into the + * unmapped page. + */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 1; + + /* + * Create a separate process to run listns() in a loop concurrently + * with namespace creation and destruction. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * The kernel should: + * 1. Successfully write the first namespace ID (within valid page) + * 2. Fail with EFAULT when trying to write the second ID (unmapped page) + * 3. Handle concurrent namespace destruction without deadlock + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 2, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create several child processes, each in its own mount namespace. + * These will be destroyed while the iterator is running listns(). + */ + for (i = 0; i < 5; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create a couple of tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 5; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* + * Signal children to exit. This will destroy their mount namespaces + * while listns() is iterating the namespace tree. + * This tests that cleanup happens outside the RCU read lock. + */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all mount namespace children to exit and cleanup */ + for (i = 0; i < 5; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test listns() error handling when the entire buffer is invalid. + * This is a sanity check that basic invalid pointer detection works. + */ +TEST(listns_complete_fault) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 *ns_ids; + ssize_t ret; + + /* Use a clearly invalid pointer */ + ns_ids = (__u64 *)0xdeadbeef; + + ret = sys_listns(&req, ns_ids, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() error handling when the buffer is NULL. + */ +TEST(listns_null_buffer) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + ssize_t ret; + + /* NULL buffer with non-zero count should fail */ + ret = sys_listns(&req, NULL, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() with a buffer that becomes invalid mid-iteration + * (after several successful writes), combined with mount namespace + * destruction to test RCU cleanup logic. + */ +TEST(listns_late_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[10]; + int sv[10][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Map two pages */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position buffer so we can write several u64s successfully + * before hitting the page boundary. + */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Request 10 namespace IDs while namespaces are being destroyed. + * This tests: + * 1. EFAULT handling when buffer becomes invalid + * 2. Namespace cleanup outside RCU read lock during iteration + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create more children with mount namespaces to increase the + * likelihood that namespace cleanup happens during iteration. + */ + for (i = 0; i < 10; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 10; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill half the children */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Small delay to let some exit */ + usleep(10000); + + /* Kill remaining children */ + for (i = 5; i < 10; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all children and cleanup */ + for (i = 0; i < 10; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test specifically focused on mount namespace cleanup during EFAULT. + * Filter for mount namespaces only. + */ +TEST(listns_mnt_ns_cleanup_on_fault) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[8]; + int sv[8][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Set up partial fault buffer */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* Position for 3 successful writes, then fault */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Call listns() to race with namespace destruction. + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* Create children with mount namespaces */ + for (i = 0; i < 8; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Do some mount operations to make cleanup more interesting */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 8; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill children to trigger namespace destruction during iteration */ + for (i = 0; i < 8; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for children and cleanup */ + for (i = 0; i < 8; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + munmap(map, page_size); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_pagination_bug.c b/tools/testing/selftests/namespaces/listns_pagination_bug.c new file mode 100644 index 000000000000..da7d33f96397 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_pagination_bug.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Minimal test case to reproduce KASAN out-of-bounds in listns pagination. + * + * The bug occurs when: + * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER) + * 2. Using pagination (req.ns_id != 0) + * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of + * the filtered type, causing it to search the unified tree and potentially + * return a namespace of the wrong type. + */ +TEST(pagination_with_type_filter) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, /* Filter by user namespace */ + .spare2 = 0, + .user_ns_id = 0, + }; + pid_t pids[10]; + int num_children = 10; + int i; + int sv[2]; + __u64 first_batch[3]; + ssize_t ret; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create children with user namespaces */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + char c; + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* First batch - this should work */ + ret = sys_listns(&req, first_batch, 3, 0); + if (ret < 0) { + if (errno == ENOSYS) { + close(sv[0]); + for (i = 0; i < num_children; i++) + kill(pids[i], SIGKILL); + for (i = 0; i < num_children; i++) + waitpid(pids[i], NULL, 0); + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + TH_LOG("First batch returned %zd entries", ret); + + if (ret == 3) { + __u64 second_batch[3]; + + /* Second batch - pagination triggers the bug */ + req.ns_id = first_batch[2]; /* Continue from last ID */ + ret = sys_listns(&req, second_batch, 3, 0); + + TH_LOG("Second batch returned %zd entries", ret); + ASSERT_GE(ret, 0); + } + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Cleanup */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_permissions_test.c b/tools/testing/selftests/namespaces/listns_permissions_test.c new file mode 100644 index 000000000000..82d818751a5f --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_permissions_test.c @@ -0,0 +1,759 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/capability.h> +#include <sys/ioctl.h> +#include <sys/prctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test that unprivileged users can only see namespaces they're currently in. + * Create a namespace, drop privileges, verify we can only see our own namespaces. + */ +TEST(listns_unprivileged_current_only) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + int pipefd[2]; + pid_t pid; + int status; + bool found_ours; + int unexpected_count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 our_netns_id; + bool found_ours; + int unexpected_count; + + close(pipefd[0]); + + /* Create user namespace to be unprivileged */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create a network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get our network namespace ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &our_netns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Now we're unprivileged - list all network namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* We should only see our own network namespace */ + found_ours = false; + unexpected_count = 0; + + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == our_netns_id) { + found_ours = true; + } else { + /* This is either init_net (which we can see) or unexpected */ + unexpected_count++; + } + } + + /* Send results to parent */ + write(pipefd[1], &found_ours, sizeof(found_ours)); + write(pipefd[1], &unexpected_count, sizeof(unexpected_count)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_ours = false; + unexpected_count = 0; + read(pipefd[0], &found_ours, sizeof(found_ours)); + read(pipefd[0], &unexpected_count, sizeof(unexpected_count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Child should have seen its own namespace */ + ASSERT_TRUE(found_ours); + + TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)", + unexpected_count); +} + +/* + * Test that users with CAP_SYS_ADMIN in a user namespace can see + * all namespaces owned by that user namespace. + */ +TEST(listns_cap_sys_admin_in_userns) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Will be set to our created user namespace */ + }; + __u64 ns_ids[100]; + int pipefd[2]; + pid_t pid; + int status; + bool success; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 userns_id; + ssize_t ret; + int min_expected; + bool success; + + close(pipefd[0]); + + /* Create user namespace - we'll have CAP_SYS_ADMIN in it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get the user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create several namespaces owned by this user namespace */ + unshare(CLONE_NEWNET); + unshare(CLONE_NEWUTS); + unshare(CLONE_NEWIPC); + + /* List namespaces owned by our user namespace */ + req.user_ns_id = userns_id; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* + * We have CAP_SYS_ADMIN in this user namespace, + * so we should see all namespaces owned by it. + * That includes: net, uts, ipc, and the user namespace itself. + */ + min_expected = 4; + success = (ret >= min_expected); + + write(pipefd[1], &success, sizeof(success)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + success = false; + count = 0; + read(pipefd[0], &success, sizeof(success)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(success); + TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace", + count); +} + +/* + * Test that users cannot see namespaces from unrelated user namespaces. + * Create two sibling user namespaces, verify they can't see each other's + * owned namespaces. + */ +TEST(listns_cannot_see_sibling_userns_namespaces) +{ + int pipefd[2]; + pid_t pid1, pid2; + int status; + __u64 netns_a_id; + int pipefd2[2]; + bool found_sibling_netns; + + ASSERT_EQ(pipe(pipefd), 0); + + /* Fork first child - creates user namespace A */ + pid1 = fork(); + ASSERT_GE(pid1, 0); + + if (pid1 == 0) { + int fd; + __u64 netns_a_id; + char buf; + + close(pipefd[0]); + + /* Create user namespace A */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create network namespace owned by user namespace A */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get network namespace ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &netns_a_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send namespace ID to parent */ + write(pipefd[1], &netns_a_id, sizeof(netns_a_id)); + + /* Keep alive for sibling to check */ + read(pipefd[1], &buf, 1); + close(pipefd[1]); + exit(0); + } + + /* Parent reads namespace A ID */ + close(pipefd[1]); + netns_a_id = 0; + read(pipefd[0], &netns_a_id, sizeof(netns_a_id)); + + TH_LOG("User namespace A created network namespace with ID %llu", + (unsigned long long)netns_a_id); + + /* Fork second child - creates user namespace B */ + ASSERT_EQ(pipe(pipefd2), 0); + + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + bool found_sibling_netns; + + close(pipefd[0]); + close(pipefd2[0]); + + /* Create user namespace B (sibling to A) */ + if (setup_userns() < 0) { + close(pipefd2[1]); + exit(1); + } + + /* Try to list all network namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + found_sibling_netns = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == netns_a_id) { + found_sibling_netns = true; + break; + } + } + } + + /* We should NOT see the sibling's network namespace */ + write(pipefd2[1], &found_sibling_netns, sizeof(found_sibling_netns)); + close(pipefd2[1]); + exit(0); + } + + /* Parent reads result from second child */ + close(pipefd2[1]); + found_sibling_netns = false; + read(pipefd2[0], &found_sibling_netns, sizeof(found_sibling_netns)); + close(pipefd2[0]); + + /* Signal first child to exit */ + close(pipefd[0]); + + /* Wait for both children */ + waitpid(pid2, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + waitpid(pid1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* Second child should NOT have seen first child's namespace */ + ASSERT_FALSE(found_sibling_netns); + TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace"); +} + +/* + * Test permission checking with LISTNS_CURRENT_USER. + * Verify that listing with LISTNS_CURRENT_USER respects permissions. + */ +TEST(listns_current_user_permissions) +{ + int pipefd[2]; + pid_t pid; + int status; + bool success; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids[100]; + ssize_t ret; + bool success; + + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create some namespaces owned by this user namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + /* List with LISTNS_CURRENT_USER - should see our owned namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + success = (ret >= 3); /* At least user, net, uts */ + write(pipefd[1], &success, sizeof(success)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + success = false; + count = 0; + read(pipefd[0], &success, sizeof(success)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(success); + TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count); +} + +/* + * Test that CAP_SYS_ADMIN in parent user namespace allows seeing + * child user namespace's owned namespaces. + */ +TEST(listns_parent_userns_cap_sys_admin) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_child_userns; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 parent_userns_id; + __u64 child_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_child_userns; + + close(pipefd[0]); + + /* Create parent user namespace - we have CAP_SYS_ADMIN in it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get parent user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get child user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create namespaces owned by child user namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* List namespaces owned by parent user namespace */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = 0; + req.spare2 = 0; + req.user_ns_id = parent_userns_id; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* Should see child user namespace in the list */ + found_child_userns = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == child_userns_id) { + found_child_userns = true; + break; + } + } + } + + write(pipefd[1], &found_child_userns, sizeof(found_child_userns)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_child_userns = false; + count = 0; + read(pipefd[0], &found_child_userns, sizeof(found_child_userns)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(found_child_userns); + TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)", + count); +} + +/* + * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of. + * This is different from seeing namespaces owned by a user namespace. + */ +TEST(listns_cap_sys_admin_inside_userns) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_ours; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 our_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_ours; + + close(pipefd[0]); + + /* Create user namespace - we have CAP_SYS_ADMIN inside it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get our user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* List all user namespaces globally */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = CLONE_NEWUSER; + req.spare2 = 0; + req.user_ns_id = 0; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* We should be able to see our own user namespace */ + found_ours = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == our_userns_id) { + found_ours = true; + break; + } + } + } + + write(pipefd[1], &found_ours, sizeof(found_ours)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_ours = false; + read(pipefd[0], &found_ours, sizeof(found_ours)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(found_ours); + TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of"); +} + +/* + * Test that dropping CAP_SYS_ADMIN restricts what we can see. + */ +TEST(listns_drop_cap_sys_admin) +{ + cap_t caps; + cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; + + /* This test needs to start with CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (!caps) { + SKIP(return, "Cannot get capabilities"); + } + + cap_flag_value_t cap_val; + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) { + cap_free(caps); + SKIP(return, "Cannot check CAP_SYS_ADMIN"); + } + + if (cap_val != CAP_SET) { + cap_free(caps); + SKIP(return, "Test needs CAP_SYS_ADMIN to start"); + } + cap_free(caps); + + int pipefd[2]; + pid_t pid; + int status; + bool correct; + ssize_t count_before, count_after; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids_before[100]; + ssize_t count_before; + __u64 ns_ids_after[100]; + ssize_t count_after; + bool correct; + + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Count namespaces with CAP_SYS_ADMIN */ + count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + + /* Drop CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (caps) { + cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR); + cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR); + cap_set_proc(caps); + cap_free(caps); + } + + /* Ensure we can't regain the capability */ + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + + /* Count namespaces without CAP_SYS_ADMIN */ + count_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + + /* Without CAP_SYS_ADMIN, we should see same or fewer namespaces */ + correct = (count_after <= count_before); + + write(pipefd[1], &correct, sizeof(correct)); + write(pipefd[1], &count_before, sizeof(count_before)); + write(pipefd[1], &count_after, sizeof(count_after)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + correct = false; + count_before = 0; + count_after = 0; + read(pipefd[0], &correct, sizeof(correct)); + read(pipefd[0], &count_before, sizeof(count_before)); + read(pipefd[0], &count_after, sizeof(count_after)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(correct); + TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces", + count_before, count_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_test.c b/tools/testing/selftests/namespaces/listns_test.c new file mode 100644 index 000000000000..8a95789d6a87 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_test.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test basic listns() functionality with the unified namespace tree. + * List all active namespaces globally. + */ +TEST(listns_basic_unified) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + + /* Should find at least the initial namespaces */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active namespaces", ret); + + /* Verify all returned IDs are non-zero */ + for (ssize_t i = 0; i < ret; i++) { + ASSERT_NE(ns_ids[i], 0); + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); + } +} + +/* + * Test listns() with type filtering. + * List only network namespaces. + */ +TEST(listns_filter_by_type) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, /* Only network namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least init_net */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active network namespaces", ret); + + /* Verify we can open each namespace and it's actually a network namespace */ + for (ssize_t i = 0; i < ret && i < 5; i++) { + struct nsfs_file_handle nsfh = { + .ns_id = ns_ids[i], + .ns_type = CLONE_NEWNET, + .ns_inum = 0, + }; + struct file_handle *fh; + int fd; + + fh = (struct file_handle *)malloc(sizeof(*fh) + sizeof(nsfh)); + ASSERT_NE(fh, NULL); + fh->handle_bytes = sizeof(nsfh); + fh->handle_type = 0; + memcpy(fh->f_handle, &nsfh, sizeof(nsfh)); + + fd = open_by_handle_at(-10003, fh, O_RDONLY); + free(fh); + + if (fd >= 0) { + int ns_type; + /* Verify it's a network namespace via ioctl */ + ns_type = ioctl(fd, NS_GET_NSTYPE); + if (ns_type >= 0) { + ASSERT_EQ(ns_type, CLONE_NEWNET); + } + close(fd); + } + } +} + +/* + * Test listns() pagination. + * List namespaces in batches. + */ +TEST(listns_pagination) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 batch1[2], batch2[2]; + ssize_t ret1, ret2; + + /* Get first batch */ + ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0); + if (ret1 < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret1, 0); + + if (ret1 == 0) + SKIP(return, "No namespaces found"); + + TH_LOG("First batch: %zd namespaces", ret1); + + /* Get second batch using last ID from first batch */ + if (ret1 == ARRAY_SIZE(batch1)) { + req.ns_id = batch1[ret1 - 1]; + ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0); + ASSERT_GE(ret2, 0); + + TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)", + ret2, (unsigned long long)req.ns_id); + + /* If we got more results, verify IDs are monotonically increasing */ + if (ret2 > 0) { + ASSERT_GT(batch2[0], batch1[ret1 - 1]); + TH_LOG("Pagination working: %llu > %llu", + (unsigned long long)batch2[0], + (unsigned long long)batch1[ret1 - 1]); + } + } else { + TH_LOG("All namespaces fit in first batch"); + } +} + +/* + * Test listns() with LISTNS_CURRENT_USER. + * List namespaces owned by current user namespace. + */ +TEST(listns_current_user) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least the initial namespaces if we're in init_user_ns */ + TH_LOG("Found %zd namespaces owned by current user namespace", ret); + + for (ssize_t i = 0; i < ret; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); +} + +/* + * Test that listns() only returns active namespaces. + * Create a namespace, let it become inactive, verify it's not listed. + */ +TEST(listns_only_active) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[100], ns_ids_after[100]; + ssize_t ret_before, ret_after; + int pipefd[2]; + pid_t pid; + __u64 new_ns_id = 0; + int status; + + /* Get initial list */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret_before, 0); + + TH_LOG("Before: %zd active network namespaces", ret_before); + + /* Create a new namespace in a child process and get its ID */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 ns_id; + + close(pipefd[0]); + + /* Create new network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get its ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send ID to parent */ + write(pipefd[1], &ns_id, sizeof(ns_id)); + close(pipefd[1]); + + /* Keep namespace active briefly */ + usleep(100000); + exit(0); + } + + /* Parent reads the new namespace ID */ + { + int bytes; + + close(pipefd[1]); + bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id)); + close(pipefd[0]); + + if (bytes == sizeof(new_ns_id)) { + __u64 ns_ids_during[100]; + int ret_during; + + TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id); + + /* List namespaces while child is still alive - should see new one */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + ASSERT_GE(ret_during, 0); + TH_LOG("During: %d active network namespaces", ret_during); + + /* Should have more namespaces than before */ + ASSERT_GE(ret_during, ret_before); + } + } + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + + /* Give time for namespace to become inactive */ + usleep(100000); + + /* List namespaces after child exits - should not see new one */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + TH_LOG("After: %zd active network namespaces", ret_after); + + /* Verify the new namespace ID is not in the after list */ + if (new_ns_id != 0) { + bool found = false; + + for (ssize_t i = 0; i < ret_after; i++) { + if (ns_ids_after[i] == new_ns_id) { + found = true; + break; + } + } + ASSERT_FALSE(found); + } +} + +/* + * Test listns() with specific user namespace ID. + * Create a user namespace and list namespaces it owns. + */ +TEST(listns_specific_userns) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, /* Will be filled with created userns ID */ + }; + __u64 ns_ids[100]; + int sv[2]; + pid_t pid; + int status; + __u64 user_ns_id = 0; + int bytes; + ssize_t ret; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 ns_id; + char buf; + + close(sv[0]); + + /* Create new user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Send ID to parent */ + if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) { + close(sv[1]); + exit(1); + } + + /* Create some namespaces owned by this user namespace */ + unshare(CLONE_NEWNET); + unshare(CLONE_NEWUTS); + + /* Wait for parent signal */ + if (read(sv[1], &buf, 1) != 1) { + close(sv[1]); + exit(1); + } + close(sv[1]); + exit(0); + } + + /* Parent */ + close(sv[1]); + bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id)); + + if (bytes != sizeof(user_ns_id)) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get user namespace ID from child"); + } + + TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id); + + /* List namespaces owned by this user namespace */ + req.user_ns_id = user_ns_id; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + if (ret < 0) { + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) { + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + TH_LOG("Found %zd namespaces owned by user namespace %llu", ret, + (unsigned long long)user_ns_id); + + /* Should find at least the network and UTS namespaces we created */ + if (ret > 0) { + for (ssize_t i = 0; i < ret && i < 10; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); + } + + /* Signal child to exit */ + if (write(sv[0], "X", 1) != 1) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + ASSERT_TRUE(false); + } + close(sv[0]); + waitpid(pid, &status, 0); +} + +/* + * Test listns() with multiple namespace types filter. + */ +TEST(listns_multiple_types) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUTS, /* Network and UTS */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + TH_LOG("Found %zd active network/UTS namespaces", ret); + + for (ssize_t i = 0; i < ret; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); +} + +/* + * Test that hierarchical active reference propagation keeps parent + * user namespaces visible in listns(). + */ +TEST(listns_hierarchical_visibility) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 parent_ns_id = 0, child_ns_id = 0; + int sv[2]; + pid_t pid; + int status; + int bytes; + __u64 ns_ids[100]; + ssize_t ret; + bool found_parent, found_child; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + char buf; + + close(sv[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) { + close(sv[1]); + exit(1); + } + if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal */ + if (read(sv[1], &buf, 1) != 1) { + close(sv[1]); + exit(1); + } + close(sv[1]); + exit(0); + } + + /* Parent */ + close(sv[1]); + + /* Read both namespace IDs */ + bytes = read(sv[0], &parent_ns_id, sizeof(parent_ns_id)); + bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id)); + + if (bytes != (int)(2 * sizeof(__u64))) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace IDs from child"); + } + + TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id); + TH_LOG("Child user namespace ID: %llu", (unsigned long long)child_ns_id); + + /* List all user namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + if (ret < 0 && errno == ENOSYS) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "listns() not supported"); + } + + ASSERT_GE(ret, 0); + TH_LOG("Found %zd active user namespaces", ret); + + /* Both parent and child should be visible (active due to child process) */ + found_parent = false; + found_child = false; + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == parent_ns_id) + found_parent = true; + if (ns_ids[i] == child_ns_id) + found_child = true; + } + + TH_LOG("Parent namespace %s, child namespace %s", + found_parent ? "found" : "NOT FOUND", + found_child ? "found" : "NOT FOUND"); + + ASSERT_TRUE(found_child); + /* With hierarchical propagation, parent should also be active */ + ASSERT_TRUE(found_parent); + + /* Signal child to exit */ + if (write(sv[0], "X", 1) != 1) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + ASSERT_TRUE(false); + } + close(sv[0]); + waitpid(pid, &status, 0); +} + +/* + * Test error cases for listns(). + */ +TEST(listns_error_cases) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[10]; + int ret; + + /* Test with invalid flags */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + + /* Test with NULL ns_ids array */ + ret = sys_listns(&req, NULL, 10, 0); + ASSERT_LT(ret, 0); + + /* Test with invalid spare field */ + req.spare = 1; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + req.spare = 0; + + /* Test with huge nr_ns_ids */ + ret = sys_listns(&req, ns_ids, 2000000, 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/ns_active_ref_test.c b/tools/testing/selftests/namespaces/ns_active_ref_test.c new file mode 100644 index 000000000000..093268f0efaa --- /dev/null +++ b/tools/testing/selftests/namespaces/ns_active_ref_test.c @@ -0,0 +1,2672 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <pthread.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +#ifndef FILEID_NSFS +#define FILEID_NSFS 0xf1 +#endif + +/* + * Test that initial namespaces can be reopened via file handle. + * Initial namespaces should have active ref count of 1 from boot. + */ +TEST(init_ns_always_active) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd1, fd2; + struct stat st1, st2; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open initial network namespace */ + fd1 = open("/proc/1/ns/net", O_RDONLY); + ASSERT_GE(fd1, 0); + + /* Get file handle for initial namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd1, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(fd1); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + + /* Close the namespace fd */ + close(fd1); + + /* Try to reopen via file handle - should succeed since init ns is always active */ + fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd2 < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); + return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd2, 0); + + /* Verify we opened the same namespace */ + fd1 = open("/proc/1/ns/net", O_RDONLY); + ASSERT_GE(fd1, 0); + ASSERT_EQ(fstat(fd1, &st1), 0); + ASSERT_EQ(fstat(fd2, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(fd1); + close(fd2); + free(handle); +} + +/* + * Test namespace lifecycle: create a namespace in a child process, + * get a file handle while it's active, then try to reopen after + * the process exits (namespace becomes inactive). + */ +TEST(ns_inactive_after_exit) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + /* Create pipe for passing file handle from child */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Open our new namespace */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get file handle for the namespace */ + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Exit - namespace should become inactive */ + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read file handle from child */ + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Try to reopen namespace - should fail with ENOENT since it's inactive */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + /* Should fail with ENOENT (namespace inactive) or ESTALE */ + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a process is using it, + * even after the creating process exits. + */ +TEST(ns_active_with_multiple_processes) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + int syncpipe[2]; + pid_t pid1, pid2; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + + /* Create pipes for communication */ + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + pid1 = fork(); + ASSERT_GE(pid1, 0); + + if (pid1 == 0) { + /* First child - creates namespace */ + close(pipefd[0]); + close(syncpipe[1]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Open and get handle */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Wait for signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + /* Parent reads handle */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + ASSERT_GT(ret, 0); + + handle = (struct file_handle *)buf; + + /* Create second child that will keep namespace active */ + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) { + /* Second child - reopens the namespace */ + close(syncpipe[0]); + close(syncpipe[1]); + + /* Open the namespace via handle */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0) { + exit(1); + } + + /* Join the namespace */ + ret = setns(fd, CLONE_NEWNET); + close(fd); + if (ret < 0) { + exit(1); + } + + /* Sleep to keep namespace active */ + sleep(1); + exit(0); + } + + /* Let second child enter the namespace */ + usleep(100000); /* 100ms */ + + /* Signal first child to exit */ + close(syncpipe[0]); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for first child */ + waitpid(pid1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* Namespace should still be active because second child is using it */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd, 0); + close(fd); + + /* Wait for second child */ + waitpid(pid2, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); +} + +/* + * Test user namespace active ref tracking via credential lifecycle + */ +TEST(userns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Set up uid/gid mappings */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) { + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + } + + /* Get file handle */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Namespace should be inactive after all tasks exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test PID namespace active ref tracking + */ +TEST(pidns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Fork to actually enter the PID namespace */ + pid_t child = fork(); + if (child < 0) { + close(pipefd[1]); + exit(1); + } + + if (child == 0) { + /* Grandchild - in new PID namespace */ + fd = open("/proc/self/ns/pid", O_RDONLY); + if (fd < 0) { + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + exit(1); + } + + /* Send handle to grandparent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child, NULL, 0); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Namespace should be inactive after all processes exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that an open file descriptor keeps a namespace active. + * Even after the creating process exits, the namespace should remain + * active as long as an fd is held open. + */ +TEST(ns_fd_keeps_active) +{ + struct file_handle *handle; + int mount_id; + int ret; + int nsfd; + int pipe_child_ready[2]; + int pipe_parent_ready[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + char proc_path[64]; + + ASSERT_EQ(pipe(pipe_child_ready), 0); + ASSERT_EQ(pipe(pipe_parent_ready), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipe_child_ready[0]); + close(pipe_parent_ready[1]); + + TH_LOG("Child: creating new network namespace"); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: network namespace created successfully"); + + /* Get file handle for the namespace */ + nsfd = open("/proc/self/ns/net", O_RDONLY); + if (nsfd < 0) { + TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: opened namespace fd %d", nsfd); + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH); + close(nsfd); + + if (ret < 0) { + TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes); + + /* Send file handle to parent */ + ret = write(pipe_child_ready[1], buf, sizeof(*handle) + handle->handle_bytes); + TH_LOG("Child: sent %d bytes of file handle to parent", ret); + close(pipe_child_ready[1]); + + /* Wait for parent to open the fd */ + TH_LOG("Child: waiting for parent to open fd"); + ret = read(pipe_parent_ready[0], &sync_byte, 1); + close(pipe_parent_ready[0]); + + TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret); + /* Exit - namespace should stay active because parent holds fd */ + exit(0); + } + + /* Parent process */ + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + + TH_LOG("Parent: reading file handle from child"); + + /* Read file handle from child */ + ret = read(pipe_child_ready[0], buf, sizeof(buf)); + close(pipe_child_ready[0]); + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes); + + /* Open the child's namespace while it's still alive */ + snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid); + TH_LOG("Parent: opening child's namespace at %s", proc_path); + nsfd = open(proc_path, O_RDONLY); + if (nsfd < 0) { + TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno)); + close(pipe_parent_ready[1]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child's namespace"); + } + + TH_LOG("Parent: opened child's namespace, got fd %d", nsfd); + + /* Signal child that we have the fd */ + sync_byte = 'G'; + write(pipe_parent_ready[1], &sync_byte, 1); + close(pipe_parent_ready[1]); + TH_LOG("Parent: signaled child that we have the fd"); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + TH_LOG("Child exited, parent holds fd %d to namespace", nsfd); + + /* + * Namespace should still be ACTIVE because we hold an fd. + * We should be able to reopen it via file handle. + */ + TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)"); + int fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd2, 0); + + TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(fd2, &st2), 0); + TH_LOG("Namespace inodes: nsfd=%lu, fd2=%lu", st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(fd2); + + /* Now close the fd - namespace should become inactive */ + TH_LOG("Closing fd %d - namespace should become inactive", nsfd); + close(nsfd); + + /* Now reopening should fail - namespace is inactive */ + TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)"); + fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd2, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test hierarchical active reference propagation. + * When a child namespace is active, its owning user namespace should also + * be active automatically due to hierarchical active reference propagation. + * This ensures parents are always reachable when children are active. + */ +TEST(ns_parent_always_reachable) +{ + struct file_handle *parent_handle, *child_handle; + int ret; + int child_nsfd; + int pipefd[2]; + pid_t pid; + int status; + __u64 parent_id, child_id; + char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; + char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + TH_LOG("Child: creating parent user namespace and setting up mappings"); + + /* Create parent user namespace with mappings */ + ret = setup_userns(); + if (ret < 0) { + TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid()); + + /* Get namespace ID for parent user namespace */ + int parent_fd = open("/proc/self/ns/user", O_RDONLY); + if (parent_fd < 0) { + TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: opened parent userns fd %d", parent_fd); + + if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) { + TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno)); + close(parent_fd); + close(pipefd[1]); + exit(1); + } + close(parent_fd); + + TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id); + + /* Create child user namespace within parent */ + TH_LOG("Child: creating nested child user namespace"); + ret = setup_userns(); + if (ret < 0) { + TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", getuid(), getgid()); + + /* Get namespace ID for child user namespace */ + int child_fd = open("/proc/self/ns/user", O_RDONLY); + if (child_fd < 0) { + TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: opened child userns fd %d", child_fd); + + if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) { + TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno)); + close(child_fd); + close(pipefd[1]); + exit(1); + } + close(child_fd); + + TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id); + + /* Send both namespace IDs to parent */ + TH_LOG("Child: sending both namespace IDs to parent"); + write(pipefd[1], &parent_id, sizeof(parent_id)); + write(pipefd[1], &child_id, sizeof(child_id)); + close(pipefd[1]); + + TH_LOG("Child: exiting - parent userns should become inactive"); + /* Exit - parent user namespace should become inactive */ + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + TH_LOG("Parent: reading both namespace IDs from child"); + + /* Read both namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &parent_id, sizeof(parent_id)); + if (ret != sizeof(parent_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID from child"); + } + + ret = read(pipefd[0], &child_id, sizeof(child_id)); + close(pipefd[0]); + if (ret != sizeof(child_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read child namespace ID from child"); + } + + TH_LOG("Parent: received parent_id=%llu, child_id=%llu", + (unsigned long long)parent_id, (unsigned long long)child_id); + + /* Construct file handles from namespace IDs */ + parent_handle = (struct file_handle *)parent_buf; + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); + parent_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle; + parent_fh->ns_id = parent_id; + parent_fh->ns_type = 0; + parent_fh->ns_inum = 0; + + child_handle = (struct file_handle *)child_buf; + child_handle->handle_bytes = sizeof(struct nsfs_file_handle); + child_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *child_fh = (struct nsfs_file_handle *)child_handle->f_handle; + child_fh->ns_id = child_id; + child_fh->ns_type = 0; + child_fh->ns_inum = 0; + + TH_LOG("Parent: opening child namespace BEFORE child exits"); + + /* Open child namespace while child is still alive to keep it active */ + child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY); + if (child_nsfd < 0) { + TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(errno), errno); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespace"); + } + + TH_LOG("Opened child namespace fd %d", child_nsfd); + + /* Now wait for child to exit */ + TH_LOG("Parent: waiting for child to exit"); + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + TH_LOG("Child process exited, parent holds fd to child namespace"); + + /* + * With hierarchical active reference propagation: + * Since the child namespace is active (parent process holds fd), + * the parent user namespace should ALSO be active automatically. + * This is because when we took an active reference on the child, + * it propagated up to the owning user namespace. + */ + TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)"); + int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(parent_fd, 0); + + TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd); + + /* Verify we can also get parent via NS_GET_USERNS */ + TH_LOG("Verifying NS_GET_USERNS also works"); + int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS); + if (parent_fd2 < 0) { + close(parent_fd); + close(child_nsfd); + TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno); + SKIP(return, "NS_GET_USERNS not supported or failed"); + } + + TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2); + + /* Verify both methods give us the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(parent_fd, &st1), 0); + ASSERT_EQ(fstat(parent_fd2, &st2), 0); + TH_LOG("Parent namespace inodes: parent_fd=%lu, parent_fd2=%lu", st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + /* + * Close child fd - parent should remain active because we still + * hold direct references to it (parent_fd and parent_fd2). + */ + TH_LOG("Closing child fd - parent should remain active (direct refs held)"); + close(child_nsfd); + + /* Parent should still be openable */ + TH_LOG("Verifying parent still active via file handle"); + int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(parent_fd3, 0); + close(parent_fd3); + + TH_LOG("Closing all fds to parent namespace"); + close(parent_fd); + close(parent_fd2); + + /* Both should now be inactive */ + TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)"); + parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_LT(parent_fd, 0); + TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that bind mounts keep namespaces in the tree even when inactive + */ +TEST(ns_bind_mount_keeps_in_tree) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char tmpfile[] = "/tmp/ns-test-XXXXXX"; + int tmpfd; + + /* Create temporary file for bind mount */ + tmpfd = mkstemp(tmpfile); + if (tmpfd < 0) { + SKIP(return, "Cannot create temporary file"); + } + close(tmpfd); + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Unshare mount namespace and make mounts private to avoid propagation */ + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Bind mount the namespace */ + ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Get file handle */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + umount(tmpfile); + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + umount(tmpfile); + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* + * Namespace should be inactive but still in tree due to bind mount. + * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree). + */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + /* Should be ENOENT (inactive) since bind mount keeps it in tree */ + if (errno != ENOENT && errno != ESTALE) { + TH_LOG("Unexpected error: %d", errno); + } + + /* Cleanup */ + umount(tmpfile); + unlink(tmpfile); +} + +/* + * Test multi-level hierarchy (3+ levels deep). + * Grandparent → Parent → Child + * When child is active, both parent AND grandparent should be active. + */ +TEST(ns_multilevel_hierarchy) +{ + struct file_handle *gp_handle, *p_handle, *c_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 gp_id, p_id, c_id; + char gp_buf[sizeof(*gp_handle) + MAX_HANDLE_SZ]; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c_buf[sizeof(*c_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create grandparent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int gp_fd = open("/proc/self/ns/user", O_RDONLY); + if (gp_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(gp_fd, NS_GET_ID, &gp_id) < 0) { + close(gp_fd); + close(pipefd[1]); + exit(1); + } + close(gp_fd); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c_fd = open("/proc/self/ns/user", O_RDONLY); + if (c_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c_fd, NS_GET_ID, &c_id) < 0) { + close(c_fd); + close(pipefd[1]); + exit(1); + } + close(c_fd); + + /* Send all three namespace IDs */ + write(pipefd[1], &gp_id, sizeof(gp_id)); + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c_id, sizeof(c_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &gp_id, sizeof(gp_id)); + if (ret != sizeof(gp_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read grandparent namespace ID from child"); + } + + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID from child"); + } + + ret = read(pipefd[0], &c_id, sizeof(c_id)); + close(pipefd[0]); + if (ret != sizeof(c_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read child namespace ID from child"); + } + + /* Construct file handles from namespace IDs */ + gp_handle = (struct file_handle *)gp_buf; + gp_handle->handle_bytes = sizeof(struct nsfs_file_handle); + gp_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *gp_fh = (struct nsfs_file_handle *)gp_handle->f_handle; + gp_fh->ns_id = gp_id; + gp_fh->ns_type = 0; + gp_fh->ns_inum = 0; + + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c_handle = (struct file_handle *)c_buf; + c_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c_fh = (struct nsfs_file_handle *)c_handle->f_handle; + c_fh->ns_id = c_id; + c_fh->ns_type = 0; + c_fh->ns_inum = 0; + + /* Open child before process exits */ + int c_fd = open_by_handle_at(FD_NSFS_ROOT, c_handle, O_RDONLY); + if (c_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespace"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * With 3-level hierarchy and child active: + * - Child is active (we hold fd) + * - Parent should be active (propagated from child) + * - Grandparent should be active (propagated from parent) + */ + TH_LOG("Testing parent active when child is active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + + TH_LOG("Testing grandparent active when child is active"); + int gp_fd = open_by_handle_at(FD_NSFS_ROOT, gp_handle, O_RDONLY); + ASSERT_GE(gp_fd, 0); + + close(c_fd); + close(p_fd); + close(gp_fd); +} + +/* + * Test multiple children sharing same parent. + * Parent should stay active as long as ANY child is active. + */ +TEST(ns_multiple_children_same_parent) +{ + struct file_handle *p_handle, *c1_handle, *c2_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 p_id, c1_id, c2_id; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ]; + char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create first child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c1_fd = open("/proc/self/ns/user", O_RDONLY); + if (c1_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) { + close(c1_fd); + close(pipefd[1]); + exit(1); + } + close(c1_fd); + + /* Return to parent user namespace and create second child */ + /* We can't actually do this easily, so let's create a sibling namespace + * by creating a network namespace instead */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int c2_fd = open("/proc/self/ns/net", O_RDONLY); + if (c2_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) { + close(c2_fd); + close(pipefd[1]); + exit(1); + } + close(c2_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c1_id, sizeof(c1_id)); + write(pipefd[1], &c2_id, sizeof(c2_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &c1_id, sizeof(c1_id)); + if (ret != sizeof(c1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first child namespace ID"); + } + + ret = read(pipefd[0], &c2_id, sizeof(c2_id)); + close(pipefd[0]); + if (ret != sizeof(c2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second child namespace ID"); + } + + /* Construct file handles from namespace IDs */ + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c1_handle = (struct file_handle *)c1_buf; + c1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle; + c1_fh->ns_id = c1_id; + c1_fh->ns_type = 0; + c1_fh->ns_inum = 0; + + c2_handle = (struct file_handle *)c2_buf; + c2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle; + c2_fh->ns_id = c2_id; + c2_fh->ns_type = 0; + c2_fh->ns_inum = 0; + + /* Open both children before process exits */ + int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY); + int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY); + + if (c1_fd < 0 || c2_fd < 0) { + if (c1_fd >= 0) close(c1_fd); + if (c2_fd >= 0) close(c2_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be active (both children active) */ + TH_LOG("Both children active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first child - parent should STILL be active */ + TH_LOG("Closing first child - parent should still be active"); + close(c1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second child - NOW parent should become inactive */ + TH_LOG("Closing second child - parent should become inactive"); + close(c2_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that different namespace types with same owner all contribute + * active references to the owning user namespace. + */ +TEST(ns_different_types_same_owner) +{ + struct file_handle *u_handle, *n_handle, *ut_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*u_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*n_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*ut_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + /* Create network namespace (owned by user namespace) */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + /* Create UTS namespace (also owned by user namespace) */ + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + u_handle = (struct file_handle *)u_buf; + u_handle->handle_bytes = sizeof(struct nsfs_file_handle); + u_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)u_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + n_handle = (struct file_handle *)n_buf; + n_handle->handle_bytes = sizeof(struct nsfs_file_handle); + n_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)n_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + ut_handle = (struct file_handle *)ut_buf; + ut_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ut_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)ut_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; + + /* Open both non-user namespaces before process exits */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY); + + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * Both network and UTS namespaces are active. + * User namespace should be active (gets 2 active refs). + */ + TH_LOG("Both net and uts active - user namespace should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close network namespace - user namespace should STILL be active */ + TH_LOG("Closing network ns - user ns should still be active (uts still active)"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close UTS namespace - user namespace should become inactive */ + TH_LOG("Closing uts ns - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* + * Test hierarchical propagation with deep namespace hierarchy. + * Create: init_user_ns -> user_A -> user_B -> net_ns + * When net_ns is active, both user_A and user_B should be active. + * This verifies the conditional recursion in __ns_ref_active_put() works. + */ +TEST(ns_deep_hierarchy_propagation) +{ + struct file_handle *ua_handle, *ub_handle, *net_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 ua_id, ub_id, net_id; + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user_A -> user_B -> net hierarchy */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ua_fd = open("/proc/self/ns/user", O_RDONLY); + if (ua_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { + close(ua_fd); + close(pipefd[1]); + exit(1); + } + close(ua_fd); + + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ub_fd = open("/proc/self/ns/user", O_RDONLY); + if (ub_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { + close(ub_fd); + close(pipefd[1]); + exit(1); + } + close(ub_fd); + + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int net_fd = open("/proc/self/ns/net", O_RDONLY); + if (net_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { + close(net_fd); + close(pipefd[1]); + exit(1); + } + close(net_fd); + + /* Send all three namespace IDs */ + write(pipefd[1], &ua_id, sizeof(ua_id)); + write(pipefd[1], &ub_id, sizeof(ub_id)); + write(pipefd[1], &net_id, sizeof(net_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); + if (ret != sizeof(ua_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_A namespace ID"); + } + + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); + if (ret != sizeof(ub_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_B namespace ID"); + } + + ret = read(pipefd[0], &net_id, sizeof(net_id)); + close(pipefd[0]); + if (ret != sizeof(net_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + /* Construct file handles from namespace IDs */ + ua_handle = (struct file_handle *)ua_buf; + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ua_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; + ua_fh->ns_id = ua_id; + ua_fh->ns_type = 0; + ua_fh->ns_inum = 0; + + ub_handle = (struct file_handle *)ub_buf; + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ub_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; + ub_fh->ns_id = ub_id; + ub_fh->ns_type = 0; + ub_fh->ns_inum = 0; + + net_handle = (struct file_handle *)net_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; + net_fh->ns_id = net_id; + net_fh->ns_type = 0; + net_fh->ns_inum = 0; + + /* Open net_ns before child exits to keep it active */ + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + if (net_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open network namespace"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* With net_ns active, both user_A and user_B should be active */ + TH_LOG("Testing user_B active (net_ns active causes propagation)"); + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + ASSERT_GE(ub_fd, 0); + + TH_LOG("Testing user_A active (propagated through user_B)"); + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd, 0); + + /* Close net_ns - user_B should stay active (we hold direct ref) */ + TH_LOG("Closing net_ns, user_B should remain active (direct ref held)"); + close(net_fd); + int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + ASSERT_GE(ub_fd2, 0); + close(ub_fd2); + + /* Close user_B - user_A should stay active (we hold direct ref) */ + TH_LOG("Closing user_B, user_A should remain active (direct ref held)"); + close(ub_fd); + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd2, 0); + close(ua_fd2); + + /* Close user_A - everything should become inactive */ + TH_LOG("Closing user_A, all should become inactive"); + close(ua_fd); + + /* All should now be inactive */ + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_LT(ua_fd, 0); +} + +/* + * Test that parent stays active as long as ANY child is active. + * Create parent user namespace with two child net namespaces. + * Parent should remain active until BOTH children are inactive. + */ +TEST(ns_parent_multiple_children_refcount) +{ + struct file_handle *parent_handle, *net1_handle, *net2_handle; + int ret, pipefd[2], syncpipe[2]; + pid_t pid; + int status; + __u64 p_id, n1_id, n2_id; + char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; + char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ]; + char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ]; + char sync_byte; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + close(syncpipe[1]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create first network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + int n1_fd = open("/proc/self/ns/net", O_RDONLY); + if (n1_fd < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep n1_fd open so first namespace stays active */ + + /* Create second network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + int n2_fd = open("/proc/self/ns/net", O_RDONLY); + if (n2_fd < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + if (ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) { + close(n1_fd); + close(n2_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep both n1_fd and n2_fd open */ + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &n1_id, sizeof(n1_id)); + write(pipefd[1], &n2_id, sizeof(n2_id)); + close(pipefd[1]); + + /* Wait for parent to signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + close(pipefd[1]); + close(syncpipe[0]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &n1_id, sizeof(n1_id)); + if (ret != sizeof(n1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first network namespace ID"); + } + + ret = read(pipefd[0], &n2_id, sizeof(n2_id)); + close(pipefd[0]); + if (ret != sizeof(n2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second network namespace ID"); + } + + /* Construct file handles from namespace IDs */ + parent_handle = (struct file_handle *)p_buf; + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); + parent_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + net1_handle = (struct file_handle *)n1_buf; + net1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n1_fh = (struct nsfs_file_handle *)net1_handle->f_handle; + n1_fh->ns_id = n1_id; + n1_fh->ns_type = 0; + n1_fh->ns_inum = 0; + + net2_handle = (struct file_handle *)n2_buf; + net2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle; + n2_fh->ns_id = n2_id; + n2_fh->ns_type = 0; + n2_fh->ns_inum = 0; + + /* Open both net namespaces while child is still alive */ + int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY); + int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY); + if (n1_fd < 0 || n2_fd < 0) { + if (n1_fd >= 0) close(n1_fd); + if (n2_fd >= 0) close(n2_fd); + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open net namespaces"); + } + + /* Signal child that we have opened the namespaces */ + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be active (has 2 active children) */ + TH_LOG("Both net namespaces active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first net namespace - parent should STILL be active */ + TH_LOG("Closing first net ns - parent should still be active"); + close(n1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second net namespace - parent should become inactive */ + TH_LOG("Closing second net ns - parent should become inactive"); + close(n2_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that user namespace as a child also propagates correctly. + * Create user_A -> user_B, verify when user_B is active that user_A + * is also active. This is different from non-user namespace children. + */ +TEST(ns_userns_child_propagation) +{ + struct file_handle *ua_handle, *ub_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 ua_id, ub_id; + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user_A */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ua_fd = open("/proc/self/ns/user", O_RDONLY); + if (ua_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { + close(ua_fd); + close(pipefd[1]); + exit(1); + } + close(ua_fd); + + /* Create user_B (child of user_A) */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ub_fd = open("/proc/self/ns/user", O_RDONLY); + if (ub_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { + close(ub_fd); + close(pipefd[1]); + exit(1); + } + close(ub_fd); + + /* Send both namespace IDs */ + write(pipefd[1], &ua_id, sizeof(ua_id)); + write(pipefd[1], &ub_id, sizeof(ub_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read both namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); + if (ret != sizeof(ua_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_A namespace ID"); + } + + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); + close(pipefd[0]); + if (ret != sizeof(ub_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_B namespace ID"); + } + + /* Construct file handles from namespace IDs */ + ua_handle = (struct file_handle *)ua_buf; + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ua_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; + ua_fh->ns_id = ua_id; + ua_fh->ns_type = 0; + ua_fh->ns_inum = 0; + + ub_handle = (struct file_handle *)ub_buf; + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ub_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; + ub_fh->ns_id = ub_id; + ub_fh->ns_type = 0; + ub_fh->ns_inum = 0; + + /* Open user_B before child exits */ + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + if (ub_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open user_B"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* With user_B active, user_A should also be active */ + TH_LOG("Testing user_A active when child user_B is active"); + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd, 0); + + /* Close user_B */ + TH_LOG("Closing user_B"); + close(ub_fd); + + /* user_A should remain active (we hold direct ref) */ + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd2, 0); + close(ua_fd2); + + /* Close user_A - should become inactive */ + TH_LOG("Closing user_A - should become inactive"); + close(ua_fd); + + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_LT(ua_fd, 0); +} + +/* + * Test different namespace types (net, uts, ipc) all contributing + * active references to the same owning user namespace. + */ +TEST(ns_mixed_types_same_owner) +{ + struct file_handle *user_handle, *net_handle, *uts_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + user_handle = (struct file_handle *)u_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + net_handle = (struct file_handle *)n_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + uts_handle = (struct file_handle *)ut_buf; + uts_handle->handle_bytes = sizeof(struct nsfs_file_handle); + uts_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)uts_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; + + /* Open both non-user namespaces */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY); + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* User namespace should be active (2 active children) */ + TH_LOG("Both net and uts active - user ns should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close net - user ns should STILL be active (uts still active) */ + TH_LOG("Closing net - user ns should still be active"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close uts - user ns should become inactive */ + TH_LOG("Closing uts - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* Thread test helpers and structures */ +struct thread_ns_info { + __u64 ns_id; + int pipefd; + int syncfd_read; + int syncfd_write; + int exit_code; +}; + +static void *thread_create_namespace(void *arg) +{ + struct thread_ns_info *info = (struct thread_ns_info *)arg; + int ret; + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + info->exit_code = 1; + return NULL; + } + + /* Get namespace ID */ + int fd = open("/proc/thread-self/ns/net", O_RDONLY); + if (fd < 0) { + info->exit_code = 2; + return NULL; + } + + ret = ioctl(fd, NS_GET_ID, &info->ns_id); + close(fd); + if (ret < 0) { + info->exit_code = 3; + return NULL; + } + + /* Send namespace ID to main thread */ + if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) { + info->exit_code = 4; + return NULL; + } + + /* Wait for signal to exit */ + char sync_byte; + if (read(info->syncfd_read, &sync_byte, 1) != 1) { + info->exit_code = 5; + return NULL; + } + + info->exit_code = 0; + return NULL; +} + +/* + * Test that namespace becomes inactive after thread exits. + * This verifies active reference counting works with threads, not just processes. + */ +TEST(thread_ns_inactive_after_exit) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Namespace should be active while thread is alive */ + TH_LOG("Attempting to open namespace while thread is alive (should succeed)"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd, 0); + close(nsfd); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + ASSERT_EQ(pthread_join(thread, NULL), 0); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) + SKIP(return, "Thread failed to create namespace"); + + TH_LOG("Thread exited, namespace should be inactive"); + + /* Namespace should now be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a thread holds an fd to it. + * Even after the thread exits, the namespace should remain active as long as + * another thread holds a file descriptor to it. + */ +TEST(thread_ns_fd_keeps_active) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Open namespace while thread is alive */ + TH_LOG("Opening namespace while thread is alive"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd, 0); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) { + close(nsfd); + SKIP(return, "Thread failed to create namespace"); + } + + TH_LOG("Thread exited, but main thread holds fd - namespace should remain active"); + + /* Namespace should still be active because we hold an fd */ + int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd2, 0); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(nsfd2, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(nsfd2); + + TH_LOG("Closing fd - namespace should become inactive"); + close(nsfd); + + /* Now namespace should be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* Structure for thread data in subprocess */ +struct thread_sleep_data { + int syncfd_read; +}; + +static void *thread_sleep_and_wait(void *arg) +{ + struct thread_sleep_data *data = (struct thread_sleep_data *)arg; + char sync_byte; + + /* Wait for signal to exit - read will unblock when pipe is closed */ + (void)read(data->syncfd_read, &sync_byte, 1); + return NULL; +} + +/* + * Test that namespaces become inactive after subprocess with multiple threads exits. + * Create a subprocess that unshares user and network namespaces, then creates two + * threads that share those namespaces. Verify that after all threads and subprocess + * exit, the namespaces are no longer listed by listns() and cannot be opened by + * open_by_handle_at(). + */ +TEST(thread_subprocess_ns_inactive_after_all_exit) +{ + int pipefd[2]; + int sv[2]; + pid_t pid; + int status; + __u64 user_id, net_id; + struct file_handle *user_handle, *net_handle; + char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char sync_byte; + int ret; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + close(sv[0]); + + /* Create user namespace with mappings */ + if (setup_userns() < 0) { + fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: setup_userns() succeeded\n"); + + /* Get user namespace ID */ + int user_fd = open("/proc/self/ns/user", O_RDONLY); + if (user_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/user) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(user_fd, NS_GET_ID, &user_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for user ns failed: %s\n", strerror(errno)); + close(user_fd); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(user_fd); + fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id); + + /* Unshare network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n"); + + /* Get network namespace ID */ + int net_fd = open("/proc/self/ns/net", O_RDONLY); + if (net_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/net) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for net ns failed: %s\n", strerror(errno)); + close(net_fd); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(net_fd); + fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id); + + /* Send namespace IDs to parent */ + if (write(pipefd[1], &user_id, sizeof(user_id)) != sizeof(user_id)) { + fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno)); + exit(1); + } + if (write(pipefd[1], &net_id, sizeof(net_id)) != sizeof(net_id)) { + fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno)); + exit(1); + } + close(pipefd[1]); + fprintf(stderr, "Child: sent namespace IDs to parent\n"); + + /* Create two threads that share the namespaces */ + pthread_t thread1, thread2; + struct thread_sleep_data data; + data.syncfd_read = sv[1]; + + int ret_thread = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: created thread1\n"); + + ret_thread = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + pthread_cancel(thread1); + exit(1); + } + fprintf(stderr, "Child: created thread2\n"); + + /* Wait for threads to complete - they will unblock when parent writes */ + fprintf(stderr, "Child: waiting for threads to exit\n"); + pthread_join(thread1, NULL); + fprintf(stderr, "Child: thread1 exited\n"); + pthread_join(thread2, NULL); + fprintf(stderr, "Child: thread2 exited\n"); + + close(sv[1]); + + /* Exit - namespaces should become inactive */ + fprintf(stderr, "Child: all threads joined, exiting with success\n"); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + close(sv[1]); + + TH_LOG("Parent: waiting to read namespace IDs from child"); + + /* Read namespace IDs from child */ + ret = read(pipefd[0], &user_id, sizeof(user_id)); + if (ret != sizeof(user_id)) { + TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno)); + close(pipefd[0]); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID from child"); + } + + ret = read(pipefd[0], &net_id, sizeof(net_id)); + close(pipefd[0]); + if (ret != sizeof(net_id)) { + TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno)); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID from child"); + } + + TH_LOG("Child created user ns %llu and net ns %llu with 2 threads", + (unsigned long long)user_id, (unsigned long long)net_id); + + /* Construct file handles */ + user_handle = (struct file_handle *)user_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *user_fh = (struct nsfs_file_handle *)user_handle->f_handle; + user_fh->ns_id = user_id; + user_fh->ns_type = 0; + user_fh->ns_inum = 0; + + net_handle = (struct file_handle *)net_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; + net_fh->ns_id = net_id; + net_fh->ns_type = 0; + net_fh->ns_inum = 0; + + /* Verify namespaces are active while subprocess and threads are alive */ + TH_LOG("Verifying namespaces are active while subprocess with threads is running"); + int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(user_fd, 0); + + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_GE(net_fd, 0); + + close(user_fd); + close(net_fd); + + /* Also verify they appear in listns() */ + TH_LOG("Verifying namespaces appear in listns() while active"); + struct ns_id_req req = { + .size = sizeof(struct ns_id_req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids < 0) { + TH_LOG("listns() not available, skipping listns verification"); + } else { + /* Check if user_id is in the list */ + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_TRUE(found_user); + TH_LOG("User namespace found in listns() as expected"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_TRUE(found_net); + TH_LOG("Network namespace found in listns() as expected"); + } + } + + /* Signal threads to exit */ + TH_LOG("Signaling threads to exit"); + sync_byte = 'X'; + /* Write two bytes - one for each thread */ + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + close(sv[0]); + + /* Wait for child process to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + if (WEXITSTATUS(status) != 0) { + TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status)); + SKIP(return, "Child process failed"); + } + + TH_LOG("Subprocess and all threads have exited successfully"); + + /* Verify namespaces are now inactive - open_by_handle_at should fail */ + TH_LOG("Verifying namespaces are inactive after subprocess and threads exit"); + user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(user_fd, 0); + TH_LOG("User namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); + + net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_LT(net_fd, 0); + TH_LOG("Network namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); + + /* Verify namespaces do NOT appear in listns() */ + TH_LOG("Verifying namespaces do NOT appear in listns() when inactive"); + memset(&req, 0, sizeof(req)); + req.size = sizeof(struct ns_id_req); + req.ns_type = CLONE_NEWUSER; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_FALSE(found_user); + TH_LOG("User namespace correctly not listed in listns()"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_FALSE(found_net); + TH_LOG("Network namespace correctly not listed in listns()"); + } + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c index e28accd74a57..527ade0a8673 100644 --- a/tools/testing/selftests/namespaces/nsid_test.c +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -6,6 +6,7 @@ #include <libgen.h> #include <limits.h> #include <pthread.h> +#include <signal.h> #include <string.h> #include <sys/mount.h> #include <poll.h> @@ -14,12 +15,30 @@ #include <sys/stat.h> #include <sys/socket.h> #include <sys/un.h> +#include <sys/wait.h> #include <unistd.h> #include <linux/fs.h> #include <linux/limits.h> #include <linux/nsfs.h> #include "../kselftest_harness.h" +/* Fixture for tests that create child processes */ +FIXTURE(nsid) { + pid_t child_pid; +}; + +FIXTURE_SETUP(nsid) { + self->child_pid = 0; +} + +FIXTURE_TEARDOWN(nsid) { + /* Clean up any child process that may still be running */ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } +} + TEST(nsid_mntns_basic) { __u64 mnt_ns_id = 0; @@ -44,7 +63,7 @@ TEST(nsid_mntns_basic) close(fd_mntns); } -TEST(nsid_mntns_separate) +TEST_F(nsid, mntns_separate) { __u64 parent_mnt_ns_id = 0; __u64 child_mnt_ns_id = 0; @@ -90,6 +109,9 @@ TEST(nsid_mntns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -99,8 +121,6 @@ TEST(nsid_mntns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_mntns); SKIP(return, "No permission to create mount namespace"); } @@ -123,10 +143,6 @@ TEST(nsid_mntns_separate) close(fd_parent_mntns); close(fd_child_mntns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_cgroupns_basic) @@ -153,7 +169,7 @@ TEST(nsid_cgroupns_basic) close(fd_cgroupns); } -TEST(nsid_cgroupns_separate) +TEST_F(nsid, cgroupns_separate) { __u64 parent_cgroup_ns_id = 0; __u64 child_cgroup_ns_id = 0; @@ -199,6 +215,9 @@ TEST(nsid_cgroupns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -208,8 +227,6 @@ TEST(nsid_cgroupns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_cgroupns); SKIP(return, "No permission to create cgroup namespace"); } @@ -232,10 +249,6 @@ TEST(nsid_cgroupns_separate) close(fd_parent_cgroupns); close(fd_child_cgroupns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_ipcns_basic) @@ -262,7 +275,7 @@ TEST(nsid_ipcns_basic) close(fd_ipcns); } -TEST(nsid_ipcns_separate) +TEST_F(nsid, ipcns_separate) { __u64 parent_ipc_ns_id = 0; __u64 child_ipc_ns_id = 0; @@ -308,6 +321,9 @@ TEST(nsid_ipcns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -317,8 +333,6 @@ TEST(nsid_ipcns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_ipcns); SKIP(return, "No permission to create IPC namespace"); } @@ -341,10 +355,6 @@ TEST(nsid_ipcns_separate) close(fd_parent_ipcns); close(fd_child_ipcns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_utsns_basic) @@ -371,7 +381,7 @@ TEST(nsid_utsns_basic) close(fd_utsns); } -TEST(nsid_utsns_separate) +TEST_F(nsid, utsns_separate) { __u64 parent_uts_ns_id = 0; __u64 child_uts_ns_id = 0; @@ -417,6 +427,9 @@ TEST(nsid_utsns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -426,8 +439,6 @@ TEST(nsid_utsns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_utsns); SKIP(return, "No permission to create UTS namespace"); } @@ -450,10 +461,6 @@ TEST(nsid_utsns_separate) close(fd_parent_utsns); close(fd_child_utsns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_userns_basic) @@ -480,7 +487,7 @@ TEST(nsid_userns_basic) close(fd_userns); } -TEST(nsid_userns_separate) +TEST_F(nsid, userns_separate) { __u64 parent_user_ns_id = 0; __u64 child_user_ns_id = 0; @@ -526,6 +533,9 @@ TEST(nsid_userns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -535,8 +545,6 @@ TEST(nsid_userns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_userns); SKIP(return, "No permission to create user namespace"); } @@ -559,10 +567,6 @@ TEST(nsid_userns_separate) close(fd_parent_userns); close(fd_child_userns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_timens_basic) @@ -591,7 +595,7 @@ TEST(nsid_timens_basic) close(fd_timens); } -TEST(nsid_timens_separate) +TEST_F(nsid, timens_separate) { __u64 parent_time_ns_id = 0; __u64 child_time_ns_id = 0; @@ -652,6 +656,9 @@ TEST(nsid_timens_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -660,8 +667,6 @@ TEST(nsid_timens_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_timens); close(pipefd[0]); SKIP(return, "Cannot create time namespace"); @@ -689,10 +694,6 @@ TEST(nsid_timens_separate) close(fd_parent_timens); close(fd_child_timens); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_pidns_basic) @@ -719,7 +720,7 @@ TEST(nsid_pidns_basic) close(fd_pidns); } -TEST(nsid_pidns_separate) +TEST_F(nsid, pidns_separate) { __u64 parent_pid_ns_id = 0; __u64 child_pid_ns_id = 0; @@ -776,6 +777,9 @@ TEST(nsid_pidns_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -784,8 +788,6 @@ TEST(nsid_pidns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_pidns); close(pipefd[0]); SKIP(return, "No permission to create PID namespace"); @@ -813,10 +815,6 @@ TEST(nsid_pidns_separate) close(fd_parent_pidns); close(fd_child_pidns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_netns_basic) @@ -860,7 +858,7 @@ TEST(nsid_netns_basic) close(fd_netns); } -TEST(nsid_netns_separate) +TEST_F(nsid, netns_separate) { __u64 parent_net_ns_id = 0; __u64 parent_netns_cookie = 0; @@ -920,6 +918,9 @@ TEST(nsid_netns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -929,8 +930,6 @@ TEST(nsid_netns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_netns); close(parent_sock); SKIP(return, "No permission to create network namespace"); @@ -977,10 +976,6 @@ TEST(nsid_netns_separate) close(fd_parent_netns); close(fd_child_netns); close(parent_sock); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c new file mode 100644 index 000000000000..753fd29dffd8 --- /dev/null +++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include "../pidfd/pidfd.h" +#include "../kselftest_harness.h" + +/* + * Regression tests for the setns(pidfd) active reference counting bug. + * + * These tests are based on the reproducers that triggered the race condition + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). + * + * The bug: When using setns() with a pidfd, if the target task exits between + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. + * Then ns_ref_active_get() would increment from 0 without properly resurrecting + * the owner chain, causing active reference count underflows. + */ + +/* + * Simple pidfd setns test using create_child()+unshare(). + * + * Without the fix, this would trigger active refcount warnings when the + * parent exits after doing setns(pidfd) on a child that has already exited. + */ +TEST(simple_pidfd_setns) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + int sv[2]; + char c; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create a child process without namespaces initially */ + child_pid = create_child(&pidfd, 0); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + close(sv[0]); + + if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) { + close(sv[1]); + _exit(1); + } + + /* Signal parent that namespaces are ready */ + if (write_nointr(sv[1], "1", 1) < 0) { + close(sv[1]); + _exit(1); + } + + close(sv[1]); + _exit(0); + } + ASSERT_GE(pidfd, 0); + EXPECT_EQ(close(sv[1]), 0); + + ret = read_nointr(sv[0], &c, 1); + ASSERT_EQ(ret, 1); + EXPECT_EQ(close(sv[0]), 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + TH_LOG("setns() returned %d", ret); + close(pidfd); +} + +/* + * Simple pidfd setns test using create_child(). + * + * This variation uses create_child() with namespace flags directly. + * Namespaces are created immediately at clone time. + */ +TEST(simple_pidfd_setns_clone) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + /* Create a child process with new namespaces using create_child() */ + child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + /* Child: sleep for a while so parent can setns to us */ + sleep(2); + _exit(0); + } + + /* Parent: pidfd was already created by create_child() */ + ASSERT_GE(pidfd, 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + close(pidfd); + TH_LOG("setns() returned %d", ret); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/siocgskns_test.c b/tools/testing/selftests/namespaces/siocgskns_test.c new file mode 100644 index 000000000000..ba689a22d82f --- /dev/null +++ b/tools/testing/selftests/namespaces/siocgskns_test.c @@ -0,0 +1,1824 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/if.h> +#include <linux/sockios.h> +#include <linux/nsfs.h> +#include <arpa/inet.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +#ifndef SIOCGSKNS +#define SIOCGSKNS 0x894C +#endif + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 +#endif + +#ifndef FILEID_NSFS +#define FILEID_NSFS 0xf1 +#endif + +/* + * Test basic SIOCGSKNS functionality. + * Create a socket and verify SIOCGSKNS returns the correct network namespace. + */ +TEST(siocgskns_basic) +{ + int sock_fd, netns_fd, current_netns_fd; + struct stat st1, st2; + + /* Create a TCP socket */ + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Use SIOCGSKNS to get network namespace */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Get current network namespace */ + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(current_netns_fd, 0); + + /* Verify they match */ + ASSERT_EQ(fstat(netns_fd, &st1), 0); + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(sock_fd); + close(netns_fd); + close(current_netns_fd); +} + +/* + * Test that socket file descriptors keep network namespaces active. + * Create a network namespace, create a socket in it, then exit the namespace. + * The namespace should remain active while the socket FD is held. + */ +TEST(siocgskns_keeps_netns_active) +{ + int sock_fd, netns_fd, test_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + close(ipc_sockets[1]); + exit(1); + } + + /* Create a socket in the new network namespace */ + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + TH_LOG("socket() failed: %s", strerror(errno)); + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS); + + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st), 0); + + /* + * Namespace should still be active because socket FD keeps it alive. + * Try to access it via /proc/self/fd/<fd>. + */ + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd); + test_fd = open(path, O_RDONLY); + ASSERT_GE(test_fd, 0); + close(test_fd); + close(netns_fd); + + /* Close socket - namespace should become inactive */ + close(sock_fd); + + /* Try SIOCGSKNS again - should fail since socket is closed */ + ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0); +} + +/* + * Test SIOCGSKNS with different socket types (TCP, UDP, RAW). + */ +TEST(siocgskns_socket_types) +{ + int sock_tcp, sock_udp, sock_raw; + int netns_tcp, netns_udp, netns_raw; + struct stat st_tcp, st_udp, st_raw; + + /* TCP socket */ + sock_tcp = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_tcp, 0); + + /* UDP socket */ + sock_udp = socket(AF_INET, SOCK_DGRAM, 0); + ASSERT_GE(sock_udp, 0); + + /* RAW socket (may require privileges) */ + sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP); + if (sock_raw < 0 && (errno == EPERM || errno == EACCES)) { + sock_raw = -1; /* Skip raw socket test */ + } + + /* Test SIOCGSKNS on TCP */ + netns_tcp = ioctl(sock_tcp, SIOCGSKNS); + if (netns_tcp < 0) { + close(sock_tcp); + close(sock_udp); + if (sock_raw >= 0) close(sock_raw); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_tcp, 0); + } + + /* Test SIOCGSKNS on UDP */ + netns_udp = ioctl(sock_udp, SIOCGSKNS); + ASSERT_GE(netns_udp, 0); + + /* Test SIOCGSKNS on RAW (if available) */ + if (sock_raw >= 0) { + netns_raw = ioctl(sock_raw, SIOCGSKNS); + ASSERT_GE(netns_raw, 0); + } + + /* Verify all return the same network namespace */ + ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0); + ASSERT_EQ(fstat(netns_udp, &st_udp), 0); + ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino); + + if (sock_raw >= 0) { + ASSERT_EQ(fstat(netns_raw, &st_raw), 0); + ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino); + close(netns_raw); + close(sock_raw); + } + + close(netns_tcp); + close(netns_udp); + close(sock_tcp); + close(sock_udp); +} + +/* + * Test SIOCGSKNS across setns. + * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still + * returns netns A. + */ +TEST(siocgskns_across_setns) +{ + int sock_fd, netns_a_fd, netns_b_fd, result_fd; + struct stat st_a; + + /* Get current netns (A) */ + netns_a_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(netns_a_fd, 0); + ASSERT_EQ(fstat(netns_a_fd, &st_a), 0); + + /* Create socket in netns A */ + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Create new netns (B) */ + ASSERT_EQ(unshare(CLONE_NEWNET), 0); + + netns_b_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(netns_b_fd, 0); + + /* Get netns from socket created in A */ + result_fd = ioctl(sock_fd, SIOCGSKNS); + if (result_fd < 0) { + close(sock_fd); + setns(netns_a_fd, CLONE_NEWNET); + close(netns_a_fd); + close(netns_b_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(result_fd, 0); + } + + /* Verify it still points to netns A */ + struct stat st_result_stat; + ASSERT_EQ(fstat(result_fd, &st_result_stat), 0); + ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino); + + close(result_fd); + close(sock_fd); + close(netns_b_fd); + + /* Restore original netns */ + ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0); + close(netns_a_fd); +} + +/* + * Test SIOCGSKNS fails on non-socket file descriptors. + */ +TEST(siocgskns_non_socket) +{ + int fd; + int pipefd[2]; + + /* Test on regular file */ + fd = open("/dev/null", O_RDONLY); + ASSERT_GE(fd, 0); + + ASSERT_LT(ioctl(fd, SIOCGSKNS), 0); + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); + close(fd); + + /* Test on pipe */ + ASSERT_EQ(pipe(pipefd), 0); + + ASSERT_LT(ioctl(pipefd[0], SIOCGSKNS), 0); + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); + + close(pipefd[0]); + close(pipefd[1]); +} + +/* + * Test multiple sockets keep the same network namespace active. + * Create multiple sockets, verify closing some doesn't affect others. + */ +TEST(siocgskns_multiple_sockets) +{ + int socks[5]; + int netns_fds[5]; + int i; + struct stat st; + ino_t netns_ino; + + /* Create new network namespace */ + ASSERT_EQ(unshare(CLONE_NEWNET), 0); + + /* Create multiple sockets */ + for (i = 0; i < 5; i++) { + socks[i] = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(socks[i], 0); + } + + /* Get netns from all sockets */ + for (i = 0; i < 5; i++) { + netns_fds[i] = ioctl(socks[i], SIOCGSKNS); + if (netns_fds[i] < 0) { + int j; + for (j = 0; j <= i; j++) { + close(socks[j]); + if (j < i && netns_fds[j] >= 0) + close(netns_fds[j]); + } + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fds[i], 0); + } + } + + /* Verify all point to same netns */ + ASSERT_EQ(fstat(netns_fds[0], &st), 0); + netns_ino = st.st_ino; + + for (i = 1; i < 5; i++) { + ASSERT_EQ(fstat(netns_fds[i], &st), 0); + ASSERT_EQ(st.st_ino, netns_ino); + } + + /* Close some sockets */ + for (i = 0; i < 3; i++) { + close(socks[i]); + } + + /* Remaining netns FDs should still be valid */ + for (i = 3; i < 5; i++) { + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]); + int test_fd = open(path, O_RDONLY); + ASSERT_GE(test_fd, 0); + close(test_fd); + } + + /* Cleanup */ + for (i = 0; i < 5; i++) { + if (i >= 3) + close(socks[i]); + close(netns_fds[i]); + } +} + +/* + * Test socket keeps netns active after creating process exits. + * Verify that as long as the socket FD exists, the namespace remains active. + */ +TEST(siocgskns_netns_lifecycle) +{ + int sock_fd, netns_fd; + int ipc_sockets[2]; + int syncpipe[2]; + pid_t pid; + int status; + char sync_byte; + struct stat st; + ino_t netns_ino; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + ASSERT_EQ(pipe(syncpipe), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child */ + close(ipc_sockets[0]); + close(syncpipe[1]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + /* Send socket to parent */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + + /* Wait for parent signal */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + /* Parent */ + close(ipc_sockets[1]); + close(syncpipe[0]); + + /* Receive socket FD */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Get netns from socket while child is alive */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + close(sock_fd); + waitpid(pid, NULL, 0); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + ASSERT_EQ(fstat(netns_fd, &st), 0); + netns_ino = st.st_ino; + + /* Signal child to exit */ + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* + * Socket FD should still keep namespace active even after + * the creating process exited. + */ + int test_fd = ioctl(sock_fd, SIOCGSKNS); + ASSERT_GE(test_fd, 0); + + struct stat st_test; + ASSERT_EQ(fstat(test_fd, &st_test), 0); + ASSERT_EQ(st_test.st_ino, netns_ino); + + close(test_fd); + close(netns_fd); + + /* Close socket - namespace should become inactive */ + close(sock_fd); +} + +/* + * Test IPv6 sockets also work with SIOCGSKNS. + */ +TEST(siocgskns_ipv6) +{ + int sock_fd, netns_fd, current_netns_fd; + struct stat st1, st2; + + /* Create an IPv6 TCP socket */ + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Use SIOCGSKNS */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Verify it matches current namespace */ + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(current_netns_fd, 0); + + ASSERT_EQ(fstat(netns_fd, &st1), 0); + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(sock_fd); + close(netns_fd); + close(current_netns_fd); +} + +/* + * Test that socket-kept netns appears in listns() output. + * Verify that a network namespace kept alive by a socket FD appears in + * listns() output even after the creating process exits, and that it + * disappears when the socket is closed. + */ +TEST(siocgskns_listns_visibility) +{ + int sock_fd, netns_fd, owner_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + __u64 netns_id, owner_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + bool found_netns = false; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace */ + owner_fd = ioctl(netns_fd, NS_GET_USERNS); + if (owner_fd < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(owner_fd, 0); + } + + /* Get owner namespace ID */ + ret = ioctl(owner_fd, NS_GET_ID, &owner_id); + if (ret < 0) { + close(owner_fd); + close(sock_fd); + close(netns_fd); + ASSERT_EQ(ret, 0); + } + close(owner_fd); + + /* Namespace should appear in listns() output */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + /* Search for our network namespace in the list */ + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_TRUE(found_netns); + TH_LOG("Found netns %llu in listns() output (kept alive by socket)", netns_id); + + /* Now verify with owner filtering */ + req.user_ns_id = owner_id; + found_netns = false; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_TRUE(found_netns); + TH_LOG("Found netns %llu owned by userns %llu", netns_id, owner_id); + + /* Close socket - namespace should become inactive and disappear from listns() */ + close(sock_fd); + close(netns_fd); + + /* Verify it's no longer in listns() output */ + req.user_ns_id = 0; + found_netns = false; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_FALSE(found_netns); + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); +} + +/* + * Test that socket-kept netns can be reopened via file handle. + * Verify that a network namespace kept alive by a socket FD can be + * reopened using file handles even after the creating process exits. + */ +TEST(siocgskns_file_handle) +{ + int sock_fd, netns_fd, reopened_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st1, st2; + ino_t netns_ino; + __u64 netns_id; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + int ret; + + /* Allocate file_handle structure for nsfs */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st1), 0); + netns_ino = st1.st_ino; + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Construct file handle from namespace ID */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_id; + nsfs_fh->ns_type = 0; /* Type field not needed for reopening */ + nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */ + + TH_LOG("Constructed file handle for netns %lu (id=%llu)", netns_ino, netns_id); + + /* Reopen namespace using file handle (while socket still keeps it alive) */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + /* Verify it's the same namespace */ + ASSERT_EQ(fstat(reopened_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); + + close(reopened_fd); + + /* Close the netns FD */ + close(netns_fd); + + /* Try to reopen via file handle - should fail since namespace is now inactive */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(reopened_fd, 0); + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Reopen namespace using file handle (while socket still keeps it alive) */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + /* Verify it's the same namespace */ + ASSERT_EQ(fstat(reopened_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); + + /* Close socket - namespace should become inactive */ + close(sock_fd); + free(handle); +} + +/* + * Test combined listns() and file handle operations with socket-kept netns. + * Create a netns, keep it alive with a socket, verify it appears in listns(), + * then reopen it via file handle obtained from listns() entry. + */ +TEST(siocgskns_listns_and_file_handle) +{ + int sock_fd, netns_fd, userns_fd, reopened_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st; + ino_t netns_ino; + __u64 netns_id, userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + bool found_netns = false, found_userns = false; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + + /* Allocate file_handle structure for nsfs */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new userns and netns with socket */ + close(ipc_sockets[0]); + + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st), 0); + netns_ino = st.st_ino; + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace */ + userns_fd = ioctl(netns_fd, NS_GET_USERNS); + if (userns_fd < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_fd, 0); + } + + /* Get owner namespace ID */ + ret = ioctl(userns_fd, NS_GET_ID, &userns_id); + if (ret < 0) { + close(userns_fd); + free(handle); + close(sock_fd); + close(netns_fd); + ASSERT_EQ(ret, 0); + } + close(userns_fd); + + TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu", netns_ino, netns_id, userns_id); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + /* Construct file handle from namespace ID */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + struct stat reopened_st; + ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0); + ASSERT_EQ(reopened_st.st_ino, netns_ino); + + TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)", netns_ino); + + close(reopened_fd); + close(netns_fd); + + /* Try to reopen via file handle - should fail since namespace is now inactive */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(reopened_fd, 0); + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + close(netns_fd); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_FALSE(found_netns); + ASSERT_FALSE(found_userns); + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); + + close(sock_fd); + free(handle); +} + +/* + * Test multi-level namespace resurrection across three user namespace levels. + * + * This test creates a complex namespace hierarchy with three levels of user + * namespaces and a network namespace at the deepest level. It verifies that + * the resurrection semantics work correctly when SIOCGSKNS is called on a + * socket from an inactive namespace tree, and that listns() and + * open_by_handle_at() correctly respect visibility rules. + * + * Hierarchy after child processes exit (all with 0 active refcount): + * + * net_L3A (0) <- Level 3 network namespace + * | + * + + * userns_L3 (0) <- Level 3 user namespace + * | + * + + * userns_L2 (0) <- Level 2 user namespace + * | + * + + * userns_L1 (0) <- Level 1 user namespace + * | + * x + * init_user_ns + * + * The test verifies: + * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain + * 2. After resurrection, all namespaces are visible in listns() + * 3. Resurrected namespaces can be reopened via file handles + * 4. Closing the netns FD cascades down: the entire ownership chain + * (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again + * 5. Inactive namespaces disappear from listns() and cannot be reopened + * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again + * 7. After second resurrection, namespaces are visible and can be reopened + */ +TEST(siocgskns_multilevel_resurrection) +{ + int ipc_sockets[2]; + pid_t pid_l1, pid_l2, pid_l3; + int status; + + /* Namespace file descriptors to be received from child */ + int sock_L3A_fd = -1; + int netns_L3A_fd = -1; + __u64 netns_L3A_id; + __u64 userns_L1_id, userns_L2_id, userns_L3_id; + + /* For listns() and file handle testing */ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + int reopened_fd; + + /* Allocate file handle for testing */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + /* + * Fork level 1 child that creates userns_L1 + */ + pid_l1 = fork(); + ASSERT_GE(pid_l1, 0); + + if (pid_l1 == 0) { + /* Level 1 child */ + int ipc_L2[2]; + close(ipc_sockets[0]); + + /* Create userns_L1 */ + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Create socketpair for communicating with L2 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L2) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* + * Fork level 2 child that creates userns_L2 + */ + pid_l2 = fork(); + if (pid_l2 < 0) { + close(ipc_sockets[1]); + close(ipc_L2[0]); + close(ipc_L2[1]); + exit(1); + } + + if (pid_l2 == 0) { + /* Level 2 child */ + int ipc_L3[2]; + close(ipc_L2[0]); + + /* Create userns_L2 (nested inside userns_L1) */ + if (setup_userns() < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* Create socketpair for communicating with L3 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L3) < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* + * Fork level 3 child that creates userns_L3 and network namespaces + */ + pid_l3 = fork(); + if (pid_l3 < 0) { + close(ipc_L2[1]); + close(ipc_L3[0]); + close(ipc_L3[1]); + exit(1); + } + + if (pid_l3 == 0) { + /* Level 3 child - the deepest level */ + int sock_fd; + close(ipc_L3[0]); + + /* Create userns_L3 (nested inside userns_L2) */ + if (setup_userns() < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create network namespace at level 3 */ + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create socket in net_L3A */ + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Send socket FD to L2 parent */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_L3[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_L3[1]); + exit(1); + } + + close(sock_fd); + close(ipc_L3[1]); + exit(0); + } + + /* Level 2 child - receive from L3 and forward to L1 */ + close(ipc_L3[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L3[0], &msg, 0); + close(ipc_L3[0]); + + if (n != 1) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L3 child */ + waitpid(pid_l3, NULL, 0); + + /* Forward the socket FD to L1 parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Y'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_L2[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_L2[1]); + exit(1); + } + + close(received_fd); + close(ipc_L2[1]); + exit(0); + } + + /* Level 1 child - receive from L2 and forward to parent */ + close(ipc_L2[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L2[0], &msg, 0); + close(ipc_L2[0]); + + if (n != 1) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L2 child */ + waitpid(pid_l2, NULL, 0); + + /* Forward the socket FD to parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Z'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(received_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent - receive the socket from the deepest level */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + + if (n != 1) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + memcpy(&sock_L3A_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L1 child */ + waitpid(pid_l1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * At this point, all child processes have exited. The socket itself + * doesn't keep the namespace active - we need to call SIOCGSKNS which + * will resurrect the entire namespace tree by taking active references. + */ + + /* Get network namespace from socket - this resurrects the tree */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + if (netns_L3A_fd < 0) { + free(handle); + close(sock_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_L3A_fd, 0); + } + + /* Get namespace ID for net_L3A */ + ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */ + int userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS); + if (userns_L3_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_L3_fd, 0); + } + + ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id); + ASSERT_EQ(ret, 0); + + int userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS); + ASSERT_GE(userns_L2_fd, 0); + ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id); + ASSERT_EQ(ret, 0); + + int userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS); + ASSERT_GE(userns_L1_fd, 0); + ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id); + ASSERT_EQ(ret, 0); + + close(userns_L1_fd); + close(userns_L2_fd); + close(userns_L3_fd); + + TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)", + netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id); + + /* + * Test 1: Verify net_L3A is visible in listns() after resurrection. + * The entire ownership chain should be resurrected and visible. + */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + bool found_netns_L3A = false; + bool found_userns_L1 = false; + bool found_userns_L2 = false; + bool found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()"); + + /* + * Test 2: Verify net_L3A can be reopened via file handle. + */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_L3A_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened"); + + /* + * Test 3: Verify that when we close the netns FD (dropping the last + * active reference), the entire tree becomes inactive and disappears + * from listns(). The cascade goes: net_L3A drops -> userns_L3 drops -> + * userns_L2 drops -> userns_L1 drops. + */ + close(netns_L3A_fd); + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_FALSE(found_netns_L3A); + ASSERT_FALSE(found_userns_L1); + ASSERT_FALSE(found_userns_L2); + ASSERT_FALSE(found_userns_L3); + TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed"); + + /* + * Test 4: Verify file handle no longer works for inactive namespace. + */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd >= 0) { + close(reopened_fd); + free(handle); + ASSERT_TRUE(false); /* Should have failed */ + } + TH_LOG("Inactive namespace correctly cannot be reopened via file handle"); + + /* + * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again. + * The socket is still valid, so we can call SIOCGSKNS on it to resurrect + * the namespace tree once more. + */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + ASSERT_GE(netns_L3A_fd, 0); + + TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree"); + + /* Verify the namespace tree is resurrected and visible in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again"); + + /* Verify we can reopen via file handle again */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection"); + + /* Final cleanup */ + close(sock_L3A_fd); + close(netns_L3A_fd); + free(handle); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/stress_test.c b/tools/testing/selftests/namespaces/stress_test.c new file mode 100644 index 000000000000..dd7df7d6cb27 --- /dev/null +++ b/tools/testing/selftests/namespaces/stress_test.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Stress tests for namespace active reference counting. + * + * These tests validate that the active reference counting system can handle + * high load scenarios including rapid namespace creation/destruction, large + * numbers of concurrent namespaces, and various edge cases under stress. + */ + +/* + * Test rapid creation and destruction of user namespaces. + * Create and destroy namespaces in quick succession to stress the + * active reference tracking and ensure no leaks occur. + */ +TEST(rapid_namespace_creation_destruction) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[256], ns_ids_after[256]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline count of active user namespaces */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + /* Rapidly create and destroy 100 user namespaces */ + for (i = 0; i < 100; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create user namespace and immediately exit */ + if (setup_userns() < 0) + exit(1); + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline (no leaked namespaces) */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test creating many concurrent namespaces. + * Verify that listns() correctly tracks all of them and that they all + * become inactive after processes exit. + */ +TEST(many_concurrent_namespaces) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512]; + ssize_t ret_before, ret_during, ret_after; + pid_t pids[50]; + int num_children = 50; + int i; + int sv[2]; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children, each with their own user namespace */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Child: create user namespace and wait for parent signal */ + char c; + + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* List namespaces while all children are running */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + ASSERT_GE(ret_during, 0); + + TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during); + + /* Should have at least num_children more namespaces than baseline */ + ASSERT_GE(ret_during, ret_before + num_children); + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + /* If we fail to write, kill remaining children */ + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After all children exit: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test rapid namespace creation with different namespace types. + * Create multiple types of namespaces rapidly to stress the tracking system. + */ +TEST(rapid_mixed_namespace_creation) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline count */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces (all types)", ret_before); + + /* Rapidly create and destroy namespaces with multiple types */ + for (i = 0; i < 50; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create multiple namespace types */ + if (setup_userns() < 0) + exit(1); + + /* Create additional namespace types */ + if (unshare(CLONE_NEWNET) < 0) + exit(1); + if (unshare(CLONE_NEWUTS) < 0) + exit(1); + if (unshare(CLONE_NEWIPC) < 0) + exit(1); + + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test nested namespace creation under stress. + * Create deeply nested namespace hierarchies and verify proper cleanup. + */ +TEST(nested_namespace_stress) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + /* Create 20 processes, each with nested user namespaces */ + for (i = 0; i < 20; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int userns_fd; + uid_t orig_uid = getuid(); + int depth; + + /* Create nested user namespaces (up to 5 levels) */ + for (depth = 0; depth < 5; depth++) { + userns_fd = get_userns_fd(0, (depth == 0) ? orig_uid : 0, 1); + if (userns_fd < 0) + exit(1); + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + exit(1); + } + close(userns_fd); + } + + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test listns() pagination under stress. + * Create many namespaces and verify pagination works correctly. + */ +TEST(listns_pagination_stress) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + pid_t pids[30]; + int num_children = 30; + int i; + int sv[2]; + __u64 all_ns_ids[512]; + int total_found = 0; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children with user namespaces */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + char c; + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* Paginate through all namespaces using small batch sizes */ + req.ns_id = 0; + while (1) { + __u64 batch[5]; /* Small batch size to force pagination */ + ssize_t ret; + + ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0); + if (ret < 0) { + if (errno == ENOSYS) { + close(sv[0]); + for (i = 0; i < num_children; i++) + kill(pids[i], SIGKILL); + for (i = 0; i < num_children; i++) + waitpid(pids[i], NULL, 0); + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + if (ret == 0) + break; + + /* Store results */ + for (i = 0; i < ret && total_found < 512; i++) { + all_ns_ids[total_found++] = batch[i]; + } + + /* Update cursor for next batch */ + if (ret == ARRAY_SIZE(batch)) + req.ns_id = batch[ret - 1]; + else + break; + } + + TH_LOG("Paginated through %d user namespaces", total_found); + + /* Verify no duplicates in pagination */ + for (i = 0; i < total_found; i++) { + for (int j = i + 1; j < total_found; j++) { + if (all_ns_ids[i] == all_ns_ids[j]) { + TH_LOG("Found duplicate ns_id: %llu at positions %d and %d", + (unsigned long long)all_ns_ids[i], i, j); + ASSERT_TRUE(false); + } + } + } + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + } +} + +/* + * Test concurrent namespace operations. + * Multiple processes creating, querying, and destroying namespaces concurrently. + */ +TEST(concurrent_namespace_operations) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + pid_t pids[20]; + int num_workers = 20; + int i; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Create worker processes that do concurrent operations */ + for (i = 0; i < num_workers; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Each worker: create namespaces, list them, repeat */ + int iterations; + + for (iterations = 0; iterations < 10; iterations++) { + int userns_fd; + __u64 temp_ns_ids[100]; + ssize_t ret; + + /* Create a user namespace */ + userns_fd = get_userns_fd(0, getuid(), 1); + if (userns_fd < 0) + continue; + + /* List namespaces */ + ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0); + (void)ret; + + close(userns_fd); + + /* Small delay */ + usleep(1000); + } + + exit(0); + } + } + + /* Wait for all workers */ + for (i = 0; i < num_workers; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After concurrent operations: %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test namespace churn - continuous creation and destruction. + * Simulates high-churn scenarios like container orchestration. + */ +TEST(namespace_churn) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int cycle; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Simulate churn: batches of namespaces created and destroyed */ + for (cycle = 0; cycle < 10; cycle++) { + pid_t batch_pids[10]; + int i; + + /* Create batch */ + for (i = 0; i < 10; i++) { + batch_pids[i] = fork(); + ASSERT_GE(batch_pids[i], 0); + + if (batch_pids[i] == 0) { + /* Create multiple namespace types */ + if (setup_userns() < 0) + exit(1); + if (unshare(CLONE_NEWNET) < 0) + exit(1); + if (unshare(CLONE_NEWUTS) < 0) + exit(1); + + /* Keep namespaces alive briefly */ + usleep(10000); + exit(0); + } + } + + /* Wait for batch to complete */ + for (i = 0; i < 10; i++) { + int status; + waitpid(batch_pids[i], &status, 0); + } + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/wrappers.h b/tools/testing/selftests/namespaces/wrappers.h new file mode 100644 index 000000000000..9741a64a5b1d --- /dev/null +++ b/tools/testing/selftests/namespaces/wrappers.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/nsfs.h> +#include <linux/types.h> +#include <sys/syscall.h> +#include <unistd.h> + +#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__ +#define __SELFTESTS_NAMESPACES_WRAPPERS_H__ + +#ifndef __NR_listns + #if defined __alpha__ + #define __NR_listns 580 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_listns 4470 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_listns 6470 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_listns 5470 + #endif + #else + #define __NR_listns 470 + #endif +#endif + +static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids, + size_t nr_ns_ids, unsigned int flags) +{ + return syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags); +} + +#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */ diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 439101b518ee..8f9850a71f54 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -45,6 +45,7 @@ skf_net_off socket so_incoming_cpu so_netns_cookie +so_peek_off so_txtime so_rcv_listener stress_reuseport_listen diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index de805cbbdf69..528d14c598bb 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := \ scm_inq \ scm_pidfd \ scm_rights \ + so_peek_off \ unix_connect \ # end of TEST_GEN_PROGS diff --git a/tools/testing/selftests/net/af_unix/so_peek_off.c b/tools/testing/selftests/net/af_unix/so_peek_off.c new file mode 100644 index 000000000000..1a77728128e5 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/so_peek_off.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2025 Google LLC */ + +#include <stdlib.h> +#include <unistd.h> + +#include <sys/socket.h> + +#include "../../kselftest_harness.h" + +FIXTURE(so_peek_off) +{ + int fd[2]; /* 0: sender, 1: receiver */ +}; + +FIXTURE_VARIANT(so_peek_off) +{ + int type; +}; + +FIXTURE_VARIANT_ADD(so_peek_off, stream) +{ + .type = SOCK_STREAM, +}; + +FIXTURE_VARIANT_ADD(so_peek_off, dgram) +{ + .type = SOCK_DGRAM, +}; + +FIXTURE_VARIANT_ADD(so_peek_off, seqpacket) +{ + .type = SOCK_SEQPACKET, +}; + +FIXTURE_SETUP(so_peek_off) +{ + struct timeval timeout = { + .tv_sec = 0, + .tv_usec = 3000, + }; + int ret; + + ret = socketpair(AF_UNIX, variant->type, 0, self->fd); + ASSERT_EQ(0, ret); + + ret = setsockopt(self->fd[1], SOL_SOCKET, SO_RCVTIMEO_NEW, + &timeout, sizeof(timeout)); + ASSERT_EQ(0, ret); + + ret = setsockopt(self->fd[1], SOL_SOCKET, SO_PEEK_OFF, + &(int){0}, sizeof(int)); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(so_peek_off) +{ + close_range(self->fd[0], self->fd[1], 0); +} + +#define sendeq(fd, str, flags) \ + do { \ + int bytes, len = strlen(str); \ + \ + bytes = send(fd, str, len, flags); \ + ASSERT_EQ(len, bytes); \ + } while (0) + +#define recveq(fd, str, buflen, flags) \ + do { \ + char buf[(buflen) + 1] = {}; \ + int bytes; \ + \ + bytes = recv(fd, buf, buflen, flags); \ + ASSERT_NE(-1, bytes); \ + ASSERT_STREQ(str, buf); \ + } while (0) + +#define async \ + for (pid_t pid = (pid = fork(), \ + pid < 0 ? \ + __TH_LOG("Failed to start async {}"), \ + _metadata->exit_code = KSFT_FAIL, \ + __bail(1, _metadata), \ + 0xdead : \ + pid); \ + !pid; exit(0)) + +TEST_F(so_peek_off, single_chunk) +{ + sendeq(self->fd[0], "aaaabbbb", 0); + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks) +{ + sendeq(self->fd[0], "aaaa", 0); + sendeq(self->fd[0], "bbbb", 0); + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks_blocking) +{ + async { + usleep(1000); + sendeq(self->fd[0], "aaaa", 0); + } + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + + async { + usleep(1000); + sendeq(self->fd[0], "bbbb", 0); + } + + /* goto again; -> goto redo; in unix_stream_read_generic(). */ + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks_overlap) +{ + sendeq(self->fd[0], "aaaa", 0); + recveq(self->fd[1], "aa", 2, MSG_PEEK); + + sendeq(self->fd[0], "bbbb", 0); + + if (variant->type == SOCK_STREAM) { + /* SOCK_STREAM tries to fill the buffer. */ + recveq(self->fd[1], "aabb", 4, MSG_PEEK); + recveq(self->fd[1], "bb", 100, MSG_PEEK); + } else { + /* SOCK_DGRAM and SOCK_SEQPACKET returns at the skb boundary. */ + recveq(self->fd[1], "aa", 100, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); + } +} + +TEST_F(so_peek_off, two_chunks_overlap_blocking) +{ + async { + usleep(1000); + sendeq(self->fd[0], "aaaa", 0); + } + + recveq(self->fd[1], "aa", 2, MSG_PEEK); + + async { + usleep(1000); + sendeq(self->fd[0], "bbbb", 0); + } + + /* Even SOCK_STREAM does not wait if at least one byte is read. */ + recveq(self->fd[1], "aa", 100, MSG_PEEK); + + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/bareudp.sh b/tools/testing/selftests/net/bareudp.sh index 4046131e7888..d9e5b967f815 100755 --- a/tools/testing/selftests/net/bareudp.sh +++ b/tools/testing/selftests/net/bareudp.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # Test various bareudp tunnel configurations. diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh index ff2accccaf4d..b4eda6c6199e 100755 --- a/tools/testing/selftests/net/forwarding/lib_sh_test.sh +++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh @@ -30,6 +30,11 @@ tfail() do_test "tfail" false } +tfail2() +{ + do_test "tfail2" false +} + txfail() { FAIL_TO_XFAIL=yes do_test "txfail" false @@ -132,6 +137,8 @@ test_ret() ret_subtest $ksft_fail "tfail" txfail tfail ret_subtest $ksft_xfail "txfail" txfail txfail + + ret_subtest $ksft_fail "tfail2" tfail2 tfail } exit_status_tests_run() diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index ecd34f364125..892895659c7e 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -176,6 +176,8 @@ run_test() local rcv_dmac=$(mac_get $rcv_if_name) local should_receive + setup_wait + tcpdump_start $rcv_if_name mc_route_prepare $send_if_name diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index 2b1d9f2b3e9e..cfc39f70635d 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -754,11 +754,11 @@ static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, static char exthdr_pck[sizeof(buf) + MIN_EXTHDR_SIZE]; create_packet(buf, 0, 0, PAYLOAD_LEN, 0); - add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS, ext_data1); + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data1); write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr); create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0); - add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS, ext_data2); + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data2); write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr); } @@ -989,6 +989,7 @@ static void check_recv_pkts(int fd, int *correct_payload, static void gro_sender(void) { + const int fin_delay_us = 100 * 1000; static char fin_pkt[MAX_HDR_LEN]; struct sockaddr_ll daddr = {}; int txfd = -1; @@ -1032,15 +1033,22 @@ static void gro_sender(void) write_packet(txfd, fin_pkt, total_hdr_len, &daddr); } else if (strcmp(testname, "tcp") == 0) { send_changed_checksum(txfd, &daddr); + /* Adding sleep before sending FIN so that it is not + * received prior to other packets. + */ + usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); send_changed_seq(txfd, &daddr); + usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); send_changed_ts(txfd, &daddr); + usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); send_diff_opt(txfd, &daddr); + usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); } else if (strcmp(testname, "ip") == 0) { send_changed_ECN(txfd, &daddr); diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index feba4ef69a54..f448bafb3f20 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -43,7 +43,7 @@ __ksft_status_merge() weights[$i]=$((weight++)) done - if [[ ${weights[$a]} > ${weights[$b]} ]]; then + if [[ ${weights[$a]} -ge ${weights[$b]} ]]; then echo "$a" return 0 else diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index b148cadb96d0..fc7e22b503d3 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -710,8 +710,14 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bw = do_rnd_write(peerfd, winfo->buf + winfo->off, winfo->len); if (bw < 0) { - if (cfg_rcv_trunc) - return 0; + /* expected reset, continue to read */ + if (cfg_rcv_trunc && + (errno == ECONNRESET || + errno == EPIPE)) { + fds.events &= ~POLLOUT; + continue; + } + perror("write"); return 111; } @@ -737,8 +743,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, } if (fds.revents & (POLLERR | POLLNVAL)) { - if (cfg_rcv_trunc) - return 0; + if (cfg_rcv_trunc) { + fds.events &= ~(POLLERR | POLLNVAL); + continue; + } fprintf(stderr, "Unexpected revents: " "POLLERR/POLLNVAL(%x)\n", fds.revents); return 5; @@ -1433,7 +1441,7 @@ static void parse_opts(int argc, char **argv) */ if (cfg_truncate < 0) { cfg_rcv_trunc = true; - signal(SIGPIPE, handle_signal); + signal(SIGPIPE, SIG_IGN); } break; case 'j': diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 47ecb5b3836e..9b7b93f8eb0c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -492,7 +492,7 @@ do_transfer() "than expected (${expect_synrx})" retc=1 fi - if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ] && [ ${stat_ooo_now} -eq 0 ]; then + if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ]; then if [ ${stat_ooo_now} -eq 0 ]; then mptcp_lib_pr_fail "lower MPC ACK rx (${stat_ackrx_now_l})" \ "than expected (${expect_ackrx})" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 78a1aa4ecff2..43f31f8d587f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2532,7 +2532,7 @@ remove_tests() if reset "remove single subflow"; then pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns2=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 @@ -2545,8 +2545,8 @@ remove_tests() if reset "remove multiple subflows"; then pm_nl_set_limits $ns1 0 2 pm_nl_set_limits $ns2 0 2 - pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns2=-2 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 @@ -2557,7 +2557,7 @@ remove_tests() # single address, remove if reset "remove single address"; then pm_nl_set_limits $ns1 0 1 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 1 addr_nr_ns1=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2570,9 +2570,9 @@ remove_tests() # subflow and signal, remove if reset "remove subflow and signal"; then pm_nl_set_limits $ns1 0 2 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 2 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns1=-1 addr_nr_ns2=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 @@ -2584,10 +2584,10 @@ remove_tests() # subflows and signal, remove if reset "remove subflows and signal"; then pm_nl_set_limits $ns1 0 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 3 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-1 addr_nr_ns2=-2 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2599,9 +2599,9 @@ remove_tests() # addresses remove if reset "remove addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250 - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.4.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 @@ -2614,10 +2614,10 @@ remove_tests() # invalid addresses remove if reset "remove invalid addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.12.1 flags signal + pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup # broadcast IP: no packet for this address will be received on ns1 - pm_nl_add_endpoint $ns1 224.0.0.1 flags signal - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal + pm_nl_add_endpoint $ns1 224.0.0.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup pm_nl_set_limits $ns2 2 2 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 @@ -2631,10 +2631,10 @@ remove_tests() # subflows and signal, flush if reset "flush subflows and signal"; then pm_nl_set_limits $ns1 0 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 3 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2647,9 +2647,9 @@ remove_tests() if reset "flush subflows"; then pm_nl_set_limits $ns1 3 3 pm_nl_set_limits $ns2 3 3 - pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow id 150 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup id 150 + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2666,9 +2666,9 @@ remove_tests() # addresses flush if reset "flush addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250 - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.4.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2681,9 +2681,9 @@ remove_tests() # invalid addresses flush if reset "flush invalid addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.12.1 flags signal - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.14.1 flags signal + pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.14.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -3500,7 +3500,6 @@ fullmesh_tests() fastclose_tests() { if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then - MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=client \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 @@ -3509,7 +3508,6 @@ fastclose_tests() fi if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then - MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 join_rst_nr=1 \ @@ -3806,7 +3804,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 2 2 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3831,7 +3829,7 @@ userspace_tests() chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm create destroy subflow @@ -3839,7 +3837,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3859,7 +3857,7 @@ userspace_tests() chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm create id 0 subflow @@ -3867,7 +3865,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3880,7 +3878,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 2 2 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm remove initial subflow @@ -3888,7 +3886,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3904,7 +3902,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm send RM_ADDR for ID 0 @@ -3912,7 +3910,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 1 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3930,7 +3928,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi } @@ -3943,7 +3941,7 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - { speed=slow \ + { timeout_test=120 test_linkfail=128 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -3960,7 +3958,7 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags signal pm_nl_check_endpoint "modif is allowed" \ $ns2 10.0.2.2 id 1 flags signal - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && @@ -3970,7 +3968,7 @@ endpoint_tests() pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - { test_linkfail=4 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4015,7 +4013,7 @@ endpoint_tests() chk_mptcp_info subflows 3 subflows 3 done - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid kill_events_pids chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 @@ -4048,7 +4046,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal - { test_linkfail=4 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4057,39 +4055,46 @@ endpoint_tests() $ns1 10.0.2.1 id 1 flags signal chk_subflow_nr "before delete" 2 chk_mptcp_info subflows 1 subflows 1 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 1 pm_nl_del_endpoint $ns1 1 10.0.2.1 pm_nl_del_endpoint $ns1 2 224.0.0.1 sleep 0.5 chk_subflow_nr "after delete" 1 chk_mptcp_info subflows 0 subflows 0 + chk_mptcp_info add_addr_signal 0 add_addr_accepted 0 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 chk_subflow_nr "after re-add" 3 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_del_endpoint $ns1 42 10.0.1.1 sleep 0.5 chk_subflow_nr "after delete ID 0" 2 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal wait_mpj $ns2 chk_subflow_nr "after re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 + chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 pm_nl_del_endpoint $ns1 99 10.0.1.1 sleep 0.5 chk_subflow_nr "after re-delete ID 0" 2 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal wait_mpj $ns2 chk_subflow_nr "after re-re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 - mptcp_lib_kill_wait $tests_pid + chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 + mptcp_lib_kill_group_wait $tests_pid kill_events_pids chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 @@ -4121,7 +4126,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow - { test_linkfail=4 speed=20 \ + { timeout_test=120 test_linkfail=128 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4137,7 +4142,7 @@ endpoint_tests() wait_mpj $ns2 pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid join_syn_tx=3 join_connect_err=1 \ chk_join_nr 2 2 2 diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d62e653d48b0..f4388900016a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -350,6 +350,27 @@ mptcp_lib_kill_wait() { wait "${1}" 2>/dev/null } +# $1: PID +mptcp_lib_pid_list_children() { + local curr="${1}" + # evoke 'ps' only once + local pids="${2:-"$(ps o pid,ppid)"}" + + echo "${curr}" + + local pid + for pid in $(echo "${pids}" | awk "\$2 == ${curr} { print \$1 }"); do + mptcp_lib_pid_list_children "${pid}" "${pids}" + done +} + +# $1: PID +mptcp_lib_kill_group_wait() { + # Some users might not have procps-ng: cannot use "kill -- -PID" + mptcp_lib_pid_list_children "${1}" | xargs -r kill &>/dev/null + wait "${1}" 2>/dev/null +} + # $1: IP address mptcp_lib_is_v6() { [ -z "${1##*:*}" ] diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index 330e000baeb1..9416ae952e18 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -87,7 +87,6 @@ IMAGE_riscv = arch/riscv/boot/Image IMAGE_riscv32 = arch/riscv/boot/Image IMAGE_riscv64 = arch/riscv/boot/Image IMAGE_s390x = arch/s390/boot/bzImage -IMAGE_s390 = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi IMAGE_sparc32 = arch/sparc/boot/image IMAGE_sparc64 = arch/sparc/boot/image @@ -117,7 +116,6 @@ DEFCONFIG_riscv = defconfig DEFCONFIG_riscv32 = rv32_defconfig DEFCONFIG_riscv64 = defconfig DEFCONFIG_s390x = defconfig -DEFCONFIG_s390 = defconfig compat.config DEFCONFIG_loongarch = defconfig DEFCONFIG_sparc32 = sparc32_defconfig DEFCONFIG_sparc64 = sparc64_defconfig @@ -156,7 +154,6 @@ QEMU_ARCH_riscv = riscv64 QEMU_ARCH_riscv32 = riscv32 QEMU_ARCH_riscv64 = riscv64 QEMU_ARCH_s390x = s390x -QEMU_ARCH_s390 = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH_sparc32 = sparc QEMU_ARCH_sparc64 = sparc64 @@ -197,7 +194,6 @@ QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_T QEMU_ARGS_riscv32 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv64 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_sparc32 = -M SS-5 -m 256M -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_sparc64 = -M sun4u -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" @@ -223,7 +219,6 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) CFLAGS_s390x = -m64 -CFLAGS_s390 = -m31 CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_mipsn32le = -EL -mabi=n32 -fPIC -march=mips64r2 diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index e8af1fb505cf..210abe715ed9 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -23,7 +23,7 @@ all_archs=( mips32le mips32be mipsn32le mipsn32be mips64le mips64be ppc ppc64 ppc64le riscv32 riscv64 - s390x s390 + s390x loongarch sparc32 sparc64 m68k @@ -185,10 +185,6 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" - if [ "$arch" = "s390" ] && ([ "$llvm" = "1" ] || [ "$test_mode" = "user" ]); then - echo "Unsupported configuration" - return - fi if [ "$arch" = "m68k" -o "$arch" = "sh4" ] && [ "$llvm" = "1" ]; then echo "Unsupported configuration" return diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index f87993def738..d60f10a873bb 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -148,6 +148,14 @@ #define PIDFD_INFO_COREDUMP (1UL << 4) #endif +#ifndef PIDFD_INFO_SUPPORTED_MASK +#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) +#endif + +#ifndef PIDFD_INFO_COREDUMP_SIGNAL +#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) +#endif + #ifndef PIDFD_COREDUMPED #define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */ #endif @@ -183,8 +191,11 @@ struct pidfd_info { __u32 fsuid; __u32 fsgid; __s32 exit_code; - __u32 coredump_mask; - __u32 __spare1; + struct { + __u32 coredump_mask; + __u32 coredump_signal; + }; + __u64 supported_mask; }; /* diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index a0eb6e81eaa2..cb5430a2fd75 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -690,4 +690,77 @@ TEST_F(pidfd_info, thread_group_exec_thread) EXPECT_EQ(close(pidfd_thread), 0); } +/* + * Test: PIDFD_INFO_SUPPORTED_MASK field + * + * Verify that when PIDFD_INFO_SUPPORTED_MASK is requested, the kernel + * returns the supported_mask field indicating which flags the kernel supports. + */ +TEST(supported_mask_field) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_SUPPORTED_MASK, + }; + int pidfd; + pid_t pid; + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + + if (pid == 0) + pause(); + + /* Request supported_mask field */ + ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + + /* Verify PIDFD_INFO_SUPPORTED_MASK is set in the reply */ + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK)); + + /* Verify supported_mask contains expected flags */ + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_PID)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CGROUPID)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_SUPPORTED_MASK)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP_SIGNAL)); + + /* Clean up */ + sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + sys_waitid(P_PIDFD, pidfd, NULL, WEXITED); + close(pidfd); +} + +/* + * Test: PIDFD_INFO_SUPPORTED_MASK always available + * + * Verify that supported_mask is returned even when other fields are requested. + */ +TEST(supported_mask_with_other_fields) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_SUPPORTED_MASK, + }; + int pidfd; + pid_t pid; + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + + if (pid == 0) + pause(); + + ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + + /* Both fields should be present */ + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CGROUPID)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK)); + ASSERT_NE(info.supported_mask, 0); + + /* Clean up */ + sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + sys_waitid(P_PIDFD, pidfd, NULL, WEXITED); + close(pidfd); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h index 33baaa9f9997..e7b858cd3736 100644 --- a/tools/testing/selftests/rseq/rseq-s390.h +++ b/tools/testing/selftests/rseq/rseq-s390.h @@ -28,8 +28,6 @@ do { \ RSEQ_WRITE_ONCE(*(p), v); \ } while (0) -#ifdef __s390x__ - #define LONG_L "lg" #define LONG_S "stg" #define LONG_LT_R "ltgr" @@ -63,43 +61,6 @@ do { \ ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \ ".popsection\n\t" -#elif __s390__ - -#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \ - start_ip, post_commit_offset, abort_ip) \ - ".pushsection __rseq_cs, \"aw\"\n\t" \ - ".balign 32\n\t" \ - __rseq_str(label) ":\n\t" \ - ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \ - ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \ - ".popsection\n\t" \ - ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \ - ".long 0x0, " __rseq_str(label) "b\n\t" \ - ".popsection\n\t" - -/* - * Exit points of a rseq critical section consist of all instructions outside - * of the critical section where a critical section can either branch to or - * reach through the normal course of its execution. The abort IP and the - * post-commit IP are already part of the __rseq_cs section and should not be - * explicitly defined as additional exit points. Knowing all exit points is - * useful to assist debuggers stepping over the critical section. - */ -#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \ - ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \ - ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \ - ".popsection\n\t" - -#define LONG_L "l" -#define LONG_S "st" -#define LONG_LT_R "ltr" -#define LONG_CMP "c" -#define LONG_CMP_R "cr" -#define LONG_ADDI "ahi" -#define LONG_ADD_R "ar" - -#endif - #define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \ (post_commit_ip - start_ip), abort_ip) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 998e5a2f4579..0091bcd91c2c 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -961,5 +961,49 @@ "teardown": [ "$TC qdisc del dev $DUMMY root" ] + }, + { + "id": "4989", + "name": "Try to add an fq child to an ingress qdisc", + "category": [ + "qdisc", + "ingress" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DUMMY handle ffff:0 ingress" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq", + "expExitCode": "2", + "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:", + "matchJSON": [], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY ingress" + ] + }, + { + "id": "c2b0", + "name": "Try to add an fq child to a clsact qdisc", + "category": [ + "qdisc", + "ingress" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DUMMY handle ffff:0 clsact" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq", + "expExitCode": "2", + "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:", + "matchJSON": [], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY clsact" + ] } ] diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c index 252c6308c569..10badae13ebe 100644 --- a/tools/testing/selftests/timers/nanosleep.c +++ b/tools/testing/selftests/timers/nanosleep.c @@ -116,6 +116,56 @@ int nanosleep_test(int clockid, long long ns) return 0; } +static void dummy_event_handler(int val) +{ + /* No action needed */ +} + +static int nanosleep_test_remaining(int clockid) +{ + struct timespec rqtp = {}, rmtp = {}; + struct itimerspec itimer = {}; + struct sigaction sa = {}; + timer_t timer; + int ret; + + sa.sa_handler = dummy_event_handler; + ret = sigaction(SIGALRM, &sa, NULL); + if (ret) + return -1; + + ret = timer_create(clockid, NULL, &timer); + if (ret) + return -1; + + itimer.it_value.tv_nsec = NSEC_PER_SEC / 4; + ret = timer_settime(timer, 0, &itimer, NULL); + if (ret) + return -1; + + rqtp.tv_nsec = NSEC_PER_SEC / 2; + ret = clock_nanosleep(clockid, 0, &rqtp, &rmtp); + if (ret != EINTR) + return -1; + + ret = timer_delete(timer); + if (ret) + return -1; + + sa.sa_handler = SIG_DFL; + ret = sigaction(SIGALRM, &sa, NULL); + if (ret) + return -1; + + if (!in_order((struct timespec) {}, rmtp)) + return -1; + + if (!in_order(rmtp, rqtp)) + return -1; + + return 0; +} + int main(int argc, char **argv) { long long length; @@ -150,6 +200,11 @@ int main(int argc, char **argv) } length *= 100; } + ret = nanosleep_test_remaining(clockid); + if (ret < 0) { + ksft_test_result_fail("%-31s\n", clockstring(clockid)); + ksft_exit_fail(); + } ksft_test_result_pass("%-31s\n", clockstring(clockid)); next: ret = 0; diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index f0eceb0faf34..a563c438ac79 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -18,6 +18,7 @@ #include <time.h> #include <include/vdso/time64.h> #include <pthread.h> +#include <stdbool.h> #include "../kselftest.h" @@ -670,8 +671,14 @@ static void check_timer_create_exact(void) int main(int argc, char **argv) { + bool run_sig_ign_tests = ksft_min_kernel_version(6, 13); + ksft_print_header(); - ksft_set_plan(19); + if (run_sig_ign_tests) { + ksft_set_plan(19); + } else { + ksft_set_plan(10); + } ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -695,15 +702,20 @@ int main(int argc, char **argv) check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_timer_distribution(); - check_sig_ign(0); - check_sig_ign(1); - check_rearm(); - check_delete(); - check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); - check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); - check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); - check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); - check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + if (run_sig_ign_tests) { + check_sig_ign(0); + check_sig_ign(1); + check_rearm(); + check_delete(); + check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + } else { + ksft_print_msg("Skipping SIG_IGN tests on kernel < 6.13\n"); + } + check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index 5288e768b207..68625362add2 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -236,7 +236,7 @@ TEST_F(user, perf_empty_events) { ASSERT_EQ(1 << reg.enable_bit, self->check); /* Ensure write shows up at correct offset */ - ASSERT_NE(-1, write(self->data_fd, ®.write_index, + ASSERT_NE(-1, write(self->data_fd, (void *)®.write_index, sizeof(reg.write_index))); val = (void *)(((char *)perf_page) + perf_page->data_offset); ASSERT_EQ(PERF_RECORD_SAMPLE, *val); diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index 5fdd0f362337..50c261005111 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -25,10 +25,6 @@ #define VDSO_VERSION 1 #define VDSO_NAMES 0 #define VDSO_32BIT 1 -#elif defined (__s390__) && !defined(__s390x__) -#define VDSO_VERSION 2 -#define VDSO_NAMES 0 -#define VDSO_32BIT 1 #elif defined (__s390x__) #define VDSO_VERSION 2 #define VDSO_NAMES 0 diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index ed31606e01b7..69ec0c856481 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -4,9 +4,12 @@ #include <fcntl.h> #include <string.h> -#include <linux/vfio.h> + +#include <uapi/linux/types.h> +#include <linux/iommufd.h> #include <linux/list.h> #include <linux/pci_regs.h> +#include <linux/vfio.h> #include "../../../kselftest.h" @@ -185,6 +188,13 @@ struct vfio_pci_device { struct vfio_pci_driver driver; }; +struct iova_allocator { + struct iommu_iova_range *ranges; + u32 nranges; + u32 range_idx; + u64 range_offset; +}; + /* * Return the BDF string of the device that the test should use. * @@ -206,10 +216,36 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_ void vfio_pci_device_cleanup(struct vfio_pci_device *device); void vfio_pci_device_reset(struct vfio_pci_device *device); -void vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region); -void vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region); +struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, + u32 *nranges); + +struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device); +void iova_allocator_cleanup(struct iova_allocator *allocator); +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); + +int __vfio_pci_dma_map(struct vfio_pci_device *device, + struct vfio_dma_region *region); +int __vfio_pci_dma_unmap(struct vfio_pci_device *device, + struct vfio_dma_region *region, + u64 *unmapped); +int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped); + +static inline void vfio_pci_dma_map(struct vfio_pci_device *device, + struct vfio_dma_region *region) +{ + VFIO_ASSERT_EQ(__vfio_pci_dma_map(device, region), 0); +} + +static inline void vfio_pci_dma_unmap(struct vfio_pci_device *device, + struct vfio_dma_region *region) +{ + VFIO_ASSERT_EQ(__vfio_pci_dma_unmap(device, region, NULL), 0); +} + +static inline void vfio_pci_dma_unmap_all(struct vfio_pci_device *device) +{ + VFIO_ASSERT_EQ(__vfio_pci_dma_unmap_all(device, NULL), 0); +} void vfio_pci_config_access(struct vfio_pci_device *device, bool write, size_t config, size_t size, void *data); diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 0921b2451ba5..b479a359da12 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -2,6 +2,7 @@ #include <dirent.h> #include <fcntl.h> #include <libgen.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> #include <unistd.h> @@ -11,11 +12,12 @@ #include <sys/mman.h> #include <uapi/linux/types.h> +#include <linux/iommufd.h> #include <linux/limits.h> #include <linux/mman.h> +#include <linux/overflow.h> #include <linux/types.h> #include <linux/vfio.h> -#include <linux/iommufd.h> #include "../../../kselftest.h" #include <vfio_util.h> @@ -28,6 +30,249 @@ VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ } while (0) +static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, + u32 *cap_offset) +{ + struct vfio_info_cap_header *hdr; + + if (!*cap_offset) + return NULL; + + VFIO_ASSERT_LT(*cap_offset, bufsz); + VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); + + hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); + *cap_offset = hdr->next; + + return hdr; +} + +static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, + u16 cap_id) +{ + struct vfio_info_cap_header *hdr; + u32 cap_offset = info->cap_offset; + u32 max_depth; + u32 depth = 0; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) + return NULL; + + if (cap_offset) + VFIO_ASSERT_GE(cap_offset, sizeof(*info)); + + max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); + + while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { + depth++; + VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); + + if (hdr->id == cap_id) + return hdr; + } + + return NULL; +} + +/* Return buffer including capability chain, if present. Free with free() */ +static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device) +{ + struct vfio_iommu_type1_info *info; + + info = malloc(sizeof(*info)); + VFIO_ASSERT_NOT_NULL(info); + + *info = (struct vfio_iommu_type1_info) { + .argsz = sizeof(*info), + }; + + ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + info = realloc(info, info->argsz); + VFIO_ASSERT_NOT_NULL(info); + + ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + return info; +} + +/* + * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to + * report iommufd's iommu_iova_range. Free with free(). + */ +static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct vfio_iommu_type1_info_cap_iova_range *cap_range; + struct vfio_iommu_type1_info *info; + struct vfio_info_cap_header *hdr; + struct iommu_iova_range *ranges = NULL; + + info = vfio_iommu_get_info(device); + hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + VFIO_ASSERT_NOT_NULL(hdr); + + cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); + VFIO_ASSERT_GT(cap_range->nr_iovas, 0); + + ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + for (u32 i = 0; i < cap_range->nr_iovas; i++) { + ranges[i] = (struct iommu_iova_range){ + .start = cap_range->iova_ranges[i].start, + .last = cap_range->iova_ranges[i].end, + }; + } + + *nranges = cap_range->nr_iovas; + + free(info); + return ranges; +} + +/* Return iova ranges of the device's IOAS. Free with free() */ +static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + int ret; + + struct iommu_ioas_iova_ranges query = { + .size = sizeof(query), + .ioas_id = device->ioas_id, + }; + + ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + VFIO_ASSERT_EQ(ret, -1); + VFIO_ASSERT_EQ(errno, EMSGSIZE); + VFIO_ASSERT_GT(query.num_iovas, 0); + + ranges = calloc(query.num_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + query.allowed_iovas = (uintptr_t)ranges; + + ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + *nranges = query.num_iovas; + + return ranges; +} + +static int iova_range_comp(const void *a, const void *b) +{ + const struct iommu_iova_range *ra = a, *rb = b; + + if (ra->start < rb->start) + return -1; + + if (ra->start > rb->start) + return 1; + + return 0; +} + +/* Return sorted IOVA ranges of the device. Free with free(). */ +struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + + if (device->iommufd) + ranges = iommufd_iova_ranges(device, nranges); + else + ranges = vfio_iommu_iova_ranges(device, nranges); + + if (!ranges) + return NULL; + + VFIO_ASSERT_GT(*nranges, 0); + + /* Sort and check that ranges are sane and non-overlapping */ + qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); + VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); + + for (u32 i = 1; i < *nranges; i++) { + VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); + VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); + } + + return ranges; +} + +struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) +{ + struct iova_allocator *allocator; + struct iommu_iova_range *ranges; + u32 nranges; + + ranges = vfio_pci_iova_ranges(device, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + + allocator = malloc(sizeof(*allocator)); + VFIO_ASSERT_NOT_NULL(allocator); + + *allocator = (struct iova_allocator){ + .ranges = ranges, + .nranges = nranges, + .range_idx = 0, + .range_offset = 0, + }; + + return allocator; +} + +void iova_allocator_cleanup(struct iova_allocator *allocator) +{ + free(allocator->ranges); + free(allocator); +} + +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) +{ + VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); + VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); + + for (;;) { + struct iommu_iova_range *range; + iova_t iova, last; + + VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges, + "IOVA allocator out of space\n"); + + range = &allocator->ranges[allocator->range_idx]; + iova = range->start + allocator->range_offset; + + /* Check for sufficient space at the current offset */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + /* Align iova to size */ + iova = last & ~(size - 1); + + /* Check for sufficient space at the aligned iova */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + if (last == range->last) { + allocator->range_idx++; + allocator->range_offset = 0; + } else { + allocator->range_offset = last - range->start + 1; + } + + return iova; + +next_range: + allocator->range_idx++; + allocator->range_offset = 0; + } +} + iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) { struct vfio_dma_region *region; @@ -141,7 +386,7 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info); } -static void vfio_iommu_dma_map(struct vfio_pci_device *device, +static int vfio_iommu_dma_map(struct vfio_pci_device *device, struct vfio_dma_region *region) { struct vfio_iommu_type1_dma_map args = { @@ -152,10 +397,13 @@ static void vfio_iommu_dma_map(struct vfio_pci_device *device, .size = region->size, }; - ioctl_assert(device->container_fd, VFIO_IOMMU_MAP_DMA, &args); + if (ioctl(device->container_fd, VFIO_IOMMU_MAP_DMA, &args)) + return -errno; + + return 0; } -static void iommufd_dma_map(struct vfio_pci_device *device, +static int iommufd_dma_map(struct vfio_pci_device *device, struct vfio_dma_region *region) { struct iommu_ioas_map args = { @@ -169,54 +417,108 @@ static void iommufd_dma_map(struct vfio_pci_device *device, .ioas_id = device->ioas_id, }; - ioctl_assert(device->iommufd, IOMMU_IOAS_MAP, &args); + if (ioctl(device->iommufd, IOMMU_IOAS_MAP, &args)) + return -errno; + + return 0; } -void vfio_pci_dma_map(struct vfio_pci_device *device, +int __vfio_pci_dma_map(struct vfio_pci_device *device, struct vfio_dma_region *region) { + int ret; + if (device->iommufd) - iommufd_dma_map(device, region); + ret = iommufd_dma_map(device, region); else - vfio_iommu_dma_map(device, region); + ret = vfio_iommu_dma_map(device, region); + + if (ret) + return ret; list_add(®ion->link, &device->dma_regions); + + return 0; } -static void vfio_iommu_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region) +static int vfio_iommu_dma_unmap(int fd, u64 iova, u64 size, u32 flags, + u64 *unmapped) { struct vfio_iommu_type1_dma_unmap args = { .argsz = sizeof(args), - .iova = region->iova, - .size = region->size, + .iova = iova, + .size = size, + .flags = flags, }; - ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &args); + if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args)) + return -errno; + + if (unmapped) + *unmapped = args.size; + + return 0; } -static void iommufd_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region) +static int iommufd_dma_unmap(int fd, u64 iova, u64 length, u32 ioas_id, + u64 *unmapped) { struct iommu_ioas_unmap args = { .size = sizeof(args), - .iova = region->iova, - .length = region->size, - .ioas_id = device->ioas_id, + .iova = iova, + .length = length, + .ioas_id = ioas_id, }; - ioctl_assert(device->iommufd, IOMMU_IOAS_UNMAP, &args); + if (ioctl(fd, IOMMU_IOAS_UNMAP, &args)) + return -errno; + + if (unmapped) + *unmapped = args.length; + + return 0; } -void vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region) +int __vfio_pci_dma_unmap(struct vfio_pci_device *device, + struct vfio_dma_region *region, u64 *unmapped) { + int ret; + if (device->iommufd) - iommufd_dma_unmap(device, region); + ret = iommufd_dma_unmap(device->iommufd, region->iova, + region->size, device->ioas_id, + unmapped); else - vfio_iommu_dma_unmap(device, region); + ret = vfio_iommu_dma_unmap(device->container_fd, region->iova, + region->size, 0, unmapped); + + if (ret) + return ret; + + list_del_init(®ion->link); + + return 0; +} + +int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped) +{ + int ret; + struct vfio_dma_region *curr, *next; + + if (device->iommufd) + ret = iommufd_dma_unmap(device->iommufd, 0, UINT64_MAX, + device->ioas_id, unmapped); + else + ret = vfio_iommu_dma_unmap(device->container_fd, 0, 0, + VFIO_DMA_UNMAP_FLAG_ALL, unmapped); + + if (ret) + return ret; + + list_for_each_entry_safe(curr, next, &device->dma_regions, link) + list_del_init(&curr->link); - list_del(®ion->link); + return 0; } static void vfio_pci_region_get(struct vfio_pci_device *device, int index, diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index ab19c54a774d..102603d4407d 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -3,6 +3,8 @@ #include <sys/mman.h> #include <unistd.h> +#include <uapi/linux/types.h> +#include <linux/iommufd.h> #include <linux/limits.h> #include <linux/mman.h> #include <linux/sizes.h> @@ -93,6 +95,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova, FIXTURE(vfio_dma_mapping_test) { struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; }; FIXTURE_VARIANT(vfio_dma_mapping_test) { @@ -112,13 +115,17 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous, 0, 0); FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_2mb, SZ_2M, MAP_HUGETLB | MAP_HUGE_2MB); FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB | MAP_HUGE_1GB); +#undef FIXTURE_VARIANT_ADD_IOMMU_MODE + FIXTURE_SETUP(vfio_dma_mapping_test) { self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iova_allocator = iova_allocator_init(self->device); } FIXTURE_TEARDOWN(vfio_dma_mapping_test) { + iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); } @@ -129,6 +136,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) struct vfio_dma_region region; struct iommu_mapping mapping; u64 mapping_size = size; + u64 unmapped; int rc; region.vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); @@ -139,7 +147,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) else ASSERT_NE(region.vaddr, MAP_FAILED); - region.iova = (u64)region.vaddr; + region.iova = iova_allocator_alloc(self->iova_allocator, size); region.size = size; vfio_pci_dma_map(self->device, ®ion); @@ -184,7 +192,9 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) } unmap: - vfio_pci_dma_unmap(self->device, ®ion); + rc = __vfio_pci_dma_unmap(self->device, ®ion, &unmapped); + ASSERT_EQ(rc, 0); + ASSERT_EQ(unmapped, region.size); printf("Unmapped IOVA 0x%lx\n", region.iova); ASSERT_EQ(INVALID_IOVA, __to_iova(self->device, region.vaddr)); ASSERT_NE(0, iommu_mapping_get(device_bdf, region.iova, &mapping)); @@ -192,6 +202,103 @@ unmap: ASSERT_TRUE(!munmap(region.vaddr, size)); } +FIXTURE(vfio_dma_map_limit_test) { + struct vfio_pci_device *device; + struct vfio_dma_region region; + size_t mmap_size; +}; + +FIXTURE_VARIANT(vfio_dma_map_limit_test) { + const char *iommu_mode; +}; + +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_dma_map_limit_test, _iommu_mode) { \ + .iommu_mode = #_iommu_mode, \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); + +#undef FIXTURE_VARIANT_ADD_IOMMU_MODE + +FIXTURE_SETUP(vfio_dma_map_limit_test) +{ + struct vfio_dma_region *region = &self->region; + struct iommu_iova_range *ranges; + u64 region_size = getpagesize(); + iova_t last_iova; + u32 nranges; + + /* + * Over-allocate mmap by double the size to provide enough backing vaddr + * for overflow tests + */ + self->mmap_size = 2 * region_size; + + self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + region->vaddr = mmap(NULL, self->mmap_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ASSERT_NE(region->vaddr, MAP_FAILED); + + ranges = vfio_pci_iova_ranges(self->device, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + last_iova = ranges[nranges - 1].last; + free(ranges); + + /* One page prior to the last iova */ + region->iova = last_iova & ~(region_size - 1); + region->size = region_size; +} + +FIXTURE_TEARDOWN(vfio_dma_map_limit_test) +{ + vfio_pci_device_cleanup(self->device); + ASSERT_EQ(munmap(self->region.vaddr, self->mmap_size), 0); +} + +TEST_F(vfio_dma_map_limit_test, unmap_range) +{ + struct vfio_dma_region *region = &self->region; + u64 unmapped; + int rc; + + vfio_pci_dma_map(self->device, region); + ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr)); + + rc = __vfio_pci_dma_unmap(self->device, region, &unmapped); + ASSERT_EQ(rc, 0); + ASSERT_EQ(unmapped, region->size); +} + +TEST_F(vfio_dma_map_limit_test, unmap_all) +{ + struct vfio_dma_region *region = &self->region; + u64 unmapped; + int rc; + + vfio_pci_dma_map(self->device, region); + ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr)); + + rc = __vfio_pci_dma_unmap_all(self->device, &unmapped); + ASSERT_EQ(rc, 0); + ASSERT_EQ(unmapped, region->size); +} + +TEST_F(vfio_dma_map_limit_test, overflow) +{ + struct vfio_dma_region *region = &self->region; + int rc; + + region->iova = ~(iova_t)0 & ~(region->size - 1); + region->size = self->mmap_size; + + rc = __vfio_pci_dma_map(self->device, region); + ASSERT_EQ(rc, -EOVERFLOW); + + rc = __vfio_pci_dma_unmap(self->device, region, NULL); + ASSERT_EQ(rc, -EOVERFLOW); +} + int main(int argc, char *argv[]) { device_bdf = vfio_selftests_get_bdf(&argc, argv); diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index 2dbd70b7db62..f69eec8b928d 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -19,6 +19,7 @@ static const char *device_bdf; } while (0) static void region_setup(struct vfio_pci_device *device, + struct iova_allocator *iova_allocator, struct vfio_dma_region *region, u64 size) { const int flags = MAP_SHARED | MAP_ANONYMOUS; @@ -29,7 +30,7 @@ static void region_setup(struct vfio_pci_device *device, VFIO_ASSERT_NE(vaddr, MAP_FAILED); region->vaddr = vaddr; - region->iova = (u64)vaddr; + region->iova = iova_allocator_alloc(iova_allocator, size); region->size = size; vfio_pci_dma_map(device, region); @@ -44,6 +45,7 @@ static void region_teardown(struct vfio_pci_device *device, FIXTURE(vfio_pci_driver_test) { struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; struct vfio_dma_region memcpy_region; void *vaddr; int msi_fd; @@ -72,14 +74,15 @@ FIXTURE_SETUP(vfio_pci_driver_test) struct vfio_pci_driver *driver; self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iova_allocator = iova_allocator_init(self->device); driver = &self->device->driver; - region_setup(self->device, &self->memcpy_region, SZ_1G); - region_setup(self->device, &driver->region, SZ_2M); + region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G); + region_setup(self->device, self->iova_allocator, &driver->region, SZ_2M); /* Any IOVA that doesn't overlap memcpy_region and driver->region. */ - self->unmapped_iova = 8UL * SZ_1G; + self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G); vfio_pci_driver_init(self->device); self->msi_fd = self->device->msi_eventfds[driver->msi]; @@ -108,6 +111,7 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test) region_teardown(self->device, &self->memcpy_region); region_teardown(self->device, &driver->region); + iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); } diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index edacebfc1632..8ceeb8a7894f 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -389,9 +389,9 @@ run_test() { local rc host_oops_cnt_before=$(dmesg | grep -c -i 'Oops') - host_warn_cnt_before=$(dmesg --level=warn | wc -l) + host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock') vm_oops_cnt_before=$(vm_ssh -- dmesg | grep -c -i 'Oops') - vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | wc -l) + vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') name=$(echo "${1}" | awk '{ print $1 }') eval test_"${name}" @@ -403,7 +403,7 @@ run_test() { rc=$KSFT_FAIL fi - host_warn_cnt_after=$(dmesg --level=warn | wc -l) + host_warn_cnt_after=$(dmesg --level=warn | grep -c -i 'vsock') if [[ ${host_warn_cnt_after} -gt ${host_warn_cnt_before} ]]; then echo "FAIL: kernel warning detected on host" | log_host "${name}" rc=$KSFT_FAIL @@ -415,7 +415,7 @@ run_test() { rc=$KSFT_FAIL fi - vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | wc -l) + vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then echo "FAIL: kernel warning detected on vm" | log_host "${name}" rc=$KSFT_FAIL diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 05e1e6774fba..918eaec8bfbe 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -308,12 +308,13 @@ static void test_getcpu(int cpu) #ifdef __x86_64__ static jmp_buf jmpbuf; -static volatile unsigned long segv_err; +static volatile unsigned long segv_err, segv_trapno; static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t *)ctx_void; + segv_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; segv_err = ctx->uc_mcontext.gregs[REG_ERR]; siglongjmp(jmpbuf, 1); } @@ -336,7 +337,8 @@ static void test_vsys_r(void) else if (can_read) ksft_test_result_pass("We have read access\n"); else - ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err); + ksft_test_result_pass("We do not have read access (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); } static void test_vsys_x(void) @@ -347,7 +349,7 @@ static void test_vsys_x(void) return; } - ksft_print_msg("Make sure that vsyscalls really page fault\n"); + ksft_print_msg("Make sure that vsyscalls really cause a fault\n"); bool can_exec; if (sigsetjmp(jmpbuf, 1) == 0) { @@ -358,13 +360,14 @@ static void test_vsys_x(void) } if (can_exec) - ksft_test_result_fail("Executing the vsyscall did not page fault\n"); - else if (segv_err & (1 << 4)) /* INSTR */ - ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n", - segv_err); + ksft_test_result_fail("Executing the vsyscall did not fault\n"); + /* #GP or #PF (with X86_PF_INSTR) */ + else if ((segv_trapno == 13) || ((segv_trapno == 14) && (segv_err & (1 << 4)))) + ksft_test_result_pass("Executing the vsyscall page failed (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); else - ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n", - segv_err); + ksft_test_result_fail("Execution failed with the wrong error (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); } /* |
