summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
Diffstat (limited to 'tools')
-rw-r--r--tools/arch/arm64/include/asm/cputype.h4
-rw-r--r--tools/arch/arm64/include/asm/esr.h4
-rw-r--r--tools/arch/arm64/include/asm/gpr-num.h6
-rw-r--r--tools/arch/arm64/include/asm/sysreg.h12
-rw-r--r--tools/arch/arm64/include/uapi/asm/kvm.h2
-rw-r--r--tools/arch/riscv/include/asm/csr.h5
-rw-r--r--tools/arch/s390/include/uapi/asm/bitsperlong.h4
-rw-r--r--tools/arch/x86/include/asm/cpufeatures.h12
-rw-r--r--tools/arch/x86/include/asm/insn.h5
-rw-r--r--tools/arch/x86/include/asm/msr-index.h20
-rw-r--r--tools/arch/x86/include/uapi/asm/kvm.h34
-rw-r--r--tools/arch/x86/include/uapi/asm/svm.h4
-rw-r--r--tools/arch/x86/include/uapi/asm/vmx.h7
-rw-r--r--tools/arch/x86/tools/gen-cpu-feature-names-x86.awk34
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-prog.rst2
-rw-r--r--tools/build/Build2
-rw-r--r--tools/build/Makefile21
-rw-r--r--tools/build/Makefile.feature4
-rw-r--r--tools/build/feature/Makefile4
-rwxr-xr-xtools/docs/check-variable-fonts.py37
-rwxr-xr-xtools/docs/checktransupdate.py307
-rwxr-xr-xtools/docs/documentation-file-ref-check245
-rwxr-xr-xtools/docs/features-refresh.sh98
-rwxr-xr-xtools/docs/find-unused-docs.sh62
-rwxr-xr-xtools/docs/get_abi.py214
-rwxr-xr-xtools/docs/get_feat.py225
-rwxr-xr-xtools/docs/list-arch.sh11
-rwxr-xr-xtools/docs/parse-headers.py14
-rwxr-xr-xtools/docs/sphinx-build-wrapper864
-rwxr-xr-xtools/docs/sphinx-pre-install1543
-rwxr-xr-xtools/docs/test_doc_build.py513
-rw-r--r--tools/include/asm-generic/bitops/__fls.h2
-rw-r--r--tools/include/asm-generic/bitops/fls.h2
-rw-r--r--tools/include/asm-generic/bitops/fls64.h4
-rw-r--r--tools/include/linux/interval_tree_generic.h10
-rw-r--r--tools/include/linux/livepatch_external.h76
-rw-r--r--tools/include/linux/objtool_types.h2
-rw-r--r--tools/include/linux/string.h14
-rw-r--r--tools/include/nolibc/Makefile22
-rw-r--r--tools/include/nolibc/arch-arm.h2
-rw-r--r--tools/include/nolibc/arch-arm64.h2
-rw-r--r--tools/include/nolibc/arch-loongarch.h2
-rw-r--r--tools/include/nolibc/arch-m68k.h2
-rw-r--r--tools/include/nolibc/arch-mips.h2
-rw-r--r--tools/include/nolibc/arch-powerpc.h2
-rw-r--r--tools/include/nolibc/arch-riscv.h2
-rw-r--r--tools/include/nolibc/arch-s390.h7
-rw-r--r--tools/include/nolibc/arch-sh.h2
-rw-r--r--tools/include/nolibc/arch-sparc.h2
-rw-r--r--tools/include/nolibc/arch-x86.h10
-rw-r--r--tools/include/nolibc/arch.h11
-rw-r--r--tools/include/nolibc/compiler.h4
-rw-r--r--tools/include/nolibc/crt.h3
-rw-r--r--tools/include/nolibc/dirent.h6
-rw-r--r--tools/include/nolibc/getopt.h2
-rw-r--r--tools/include/nolibc/inttypes.h3
-rw-r--r--tools/include/nolibc/nolibc.h2
-rw-r--r--tools/include/nolibc/stackprotector.h2
-rw-r--r--tools/include/nolibc/std.h4
-rw-r--r--tools/include/nolibc/stdio.h10
-rw-r--r--tools/include/nolibc/stdlib.h2
-rw-r--r--tools/include/nolibc/string.h15
-rw-r--r--tools/include/nolibc/sys.h74
-rw-r--r--tools/include/nolibc/sys/auxv.h3
-rw-r--r--tools/include/nolibc/sys/mman.h5
-rw-r--r--tools/include/nolibc/sys/reboot.h2
-rw-r--r--tools/include/nolibc/sys/select.h103
-rw-r--r--tools/include/nolibc/sys/uio.h49
-rw-r--r--tools/include/nolibc/sys/wait.h18
-rw-r--r--tools/include/nolibc/time.h16
-rw-r--r--tools/include/nolibc/types.h47
-rw-r--r--tools/include/nolibc/unistd.h6
-rw-r--r--tools/include/uapi/drm/drm.h63
-rw-r--r--tools/include/uapi/linux/kvm.h3
-rw-r--r--tools/include/uapi/linux/nsfs.h70
-rw-r--r--tools/include/uapi/linux/perf_event.h21
-rw-r--r--tools/lib/bpf/bpf_helpers.h28
-rw-r--r--tools/lib/bpf/bpf_tracing.h2
-rw-r--r--tools/lib/bpf/libbpf.c4
-rw-r--r--tools/lib/bpf/usdt.c2
-rw-r--r--tools/lib/python/__init__.py (renamed from tools/docs/lib/__init__.py)0
-rw-r--r--tools/lib/python/abi/__init__.py0
-rw-r--r--tools/lib/python/abi/abi_parser.py628
-rw-r--r--tools/lib/python/abi/abi_regex.py234
-rw-r--r--tools/lib/python/abi/helpers.py38
-rw-r--r--tools/lib/python/abi/system_symbols.py378
-rwxr-xr-xtools/lib/python/feat/parse_features.py494
-rwxr-xr-xtools/lib/python/jobserver.py149
-rw-r--r--tools/lib/python/kdoc/__init__.py0
-rw-r--r--tools/lib/python/kdoc/enrich_formatter.py (renamed from tools/docs/lib/enrich_formatter.py)0
-rw-r--r--tools/lib/python/kdoc/kdoc_files.py294
-rw-r--r--tools/lib/python/kdoc/kdoc_item.py43
-rw-r--r--tools/lib/python/kdoc/kdoc_output.py824
-rw-r--r--tools/lib/python/kdoc/kdoc_parser.py1670
-rw-r--r--tools/lib/python/kdoc/kdoc_re.py270
-rwxr-xr-xtools/lib/python/kdoc/latex_fonts.py167
-rwxr-xr-xtools/lib/python/kdoc/parse_data_structs.py (renamed from tools/docs/lib/parse_data_structs.py)230
-rw-r--r--tools/lib/python/kdoc/python_version.py178
-rw-r--r--tools/net/ynl/lib/ynl-priv.h4
-rwxr-xr-xtools/net/ynl/pyynl/ethtool.py3
-rwxr-xr-xtools/net/ynl/pyynl/ynl_gen_c.py12
-rw-r--r--tools/objtool/.gitignore3
-rw-r--r--tools/objtool/Build7
-rw-r--r--tools/objtool/Makefile70
-rw-r--r--tools/objtool/arch/loongarch/decode.c29
-rw-r--r--tools/objtool/arch/loongarch/orc.c1
-rw-r--r--tools/objtool/arch/loongarch/special.c5
-rw-r--r--tools/objtool/arch/powerpc/decode.c31
-rw-r--r--tools/objtool/arch/powerpc/special.c5
-rw-r--r--tools/objtool/arch/x86/Build13
-rw-r--r--tools/objtool/arch/x86/decode.c111
-rw-r--r--tools/objtool/arch/x86/orc.c1
-rw-r--r--tools/objtool/arch/x86/special.c12
-rw-r--r--tools/objtool/builtin-check.c102
-rw-r--r--tools/objtool/builtin-klp.c53
-rw-r--r--tools/objtool/check.c1485
-rw-r--r--tools/objtool/disas.c1248
-rw-r--r--tools/objtool/elf.c822
-rw-r--r--tools/objtool/include/objtool/arch.h16
-rw-r--r--tools/objtool/include/objtool/builtin.h14
-rw-r--r--tools/objtool/include/objtool/check.h39
-rw-r--r--tools/objtool/include/objtool/checksum.h43
-rw-r--r--tools/objtool/include/objtool/checksum_types.h25
-rw-r--r--tools/objtool/include/objtool/disas.h81
-rw-r--r--tools/objtool/include/objtool/elf.h198
-rw-r--r--tools/objtool/include/objtool/endianness.h9
-rw-r--r--tools/objtool/include/objtool/klp.h35
-rw-r--r--tools/objtool/include/objtool/objtool.h4
-rw-r--r--tools/objtool/include/objtool/special.h4
-rw-r--r--tools/objtool/include/objtool/trace.h141
-rw-r--r--tools/objtool/include/objtool/util.h19
-rw-r--r--tools/objtool/include/objtool/warn.h66
-rw-r--r--tools/objtool/klp-diff.c1723
-rw-r--r--tools/objtool/klp-post-link.c168
-rw-r--r--tools/objtool/noreturns.h1
-rw-r--r--tools/objtool/objtool.c42
-rw-r--r--tools/objtool/orc_dump.c1
-rw-r--r--tools/objtool/orc_gen.c9
-rw-r--r--tools/objtool/special.c16
-rwxr-xr-xtools/objtool/sync-check.sh2
-rw-r--r--tools/objtool/trace.c203
-rw-r--r--tools/objtool/weak.c7
-rw-r--r--tools/perf/Makefile.config5
-rw-r--r--tools/perf/Makefile.perf6
-rw-r--r--tools/perf/arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--tools/perf/builtin-lock.c2
-rwxr-xr-xtools/perf/tests/shell/lock_contention.sh14
-rw-r--r--tools/perf/trace/beauty/include/uapi/linux/fcntl.h1
-rw-r--r--tools/perf/trace/beauty/include/uapi/linux/fs.h5
-rw-r--r--tools/perf/trace/beauty/include/uapi/linux/prctl.h10
-rw-r--r--tools/perf/util/header.c10
-rw-r--r--tools/perf/util/libbfd.c38
-rw-r--r--tools/perf/util/mutex.c14
-rw-r--r--tools/perf/util/mutex.h2
-rw-r--r--tools/perf/util/symbol.c6
-rw-r--r--tools/power/acpi/tools/pfrut/pfrut.c7
-rw-r--r--tools/power/cpupower/Makefile32
-rw-r--r--tools/sched_ext/Makefile4
-rw-r--r--tools/sched_ext/include/scx/common.bpf.h15
-rw-r--r--tools/sched_ext/include/scx/compat.bpf.h314
-rw-r--r--tools/sched_ext/include/scx/compat.h14
-rw-r--r--tools/sched_ext/scx_cpu0.bpf.c88
-rw-r--r--tools/sched_ext/scx_cpu0.c106
-rw-r--r--tools/sched_ext/scx_flatcg.bpf.c10
-rw-r--r--tools/sched_ext/scx_qmap.bpf.c52
-rw-r--r--tools/testing/selftests/arm64/fp/fp-ptrace.c5
-rw-r--r--tools/testing/selftests/arm64/fp/sve-ptrace.c61
-rw-r--r--tools/testing/selftests/arm64/fp/zt-test.S2
-rw-r--r--tools/testing/selftests/bpf/config3
-rw-r--r--tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c107
-rw-r--r--tools/testing/selftests/bpf/prog_tests/mptcp.c140
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c150
-rw-r--r--tools/testing/selftests/bpf/progs/iters_looping.c53
-rw-r--r--tools/testing/selftests/bpf/progs/livepatch_trampoline.c30
-rw-r--r--tools/testing/selftests/bpf/progs/mptcp_sockmap.c43
-rw-r--r--tools/testing/selftests/bpf/progs/stacktrace_ips.c49
-rw-r--r--tools/testing/selftests/bpf/progs/stream_fail.c6
-rw-r--r--tools/testing/selftests/bpf/progs/task_work.c6
-rw-r--r--tools/testing/selftests/bpf/progs/task_work_fail.c8
-rw-r--r--tools/testing/selftests/bpf/progs/task_work_stress.c4
-rw-r--r--tools/testing/selftests/bpf/test_kmods/bpf_testmod.c26
-rw-r--r--tools/testing/selftests/cachestat/.gitignore1
-rw-r--r--tools/testing/selftests/cachestat/test_cachestat.c4
-rw-r--r--tools/testing/selftests/cgroup/test_core.c7
-rw-r--r--tools/testing/selftests/cgroup/test_cpu.c7
-rw-r--r--tools/testing/selftests/cgroup/test_cpuset.c7
-rw-r--r--tools/testing/selftests/cgroup/test_freezer.c7
-rw-r--r--tools/testing/selftests/cgroup/test_kill.c7
-rw-r--r--tools/testing/selftests/cgroup/test_kmem.c7
-rw-r--r--tools/testing/selftests/cgroup/test_memcontrol.c7
-rw-r--r--tools/testing/selftests/cgroup/test_zswap.c7
-rw-r--r--tools/testing/selftests/coredump/.gitignore4
-rw-r--r--tools/testing/selftests/coredump/Makefile8
-rw-r--r--tools/testing/selftests/coredump/coredump_socket_protocol_test.c1568
-rw-r--r--tools/testing/selftests/coredump/coredump_socket_test.c742
-rw-r--r--tools/testing/selftests/coredump/coredump_test.h59
-rw-r--r--tools/testing/selftests/coredump/coredump_test_helpers.c383
-rw-r--r--tools/testing/selftests/coredump/stackdump_test.c1662
-rw-r--r--tools/testing/selftests/dma/dma_map_benchmark.c2
-rw-r--r--tools/testing/selftests/drivers/net/Makefile1
-rw-r--r--tools/testing/selftests/drivers/net/bonding/Makefile2
-rw-r--r--tools/testing/selftests/drivers/net/bonding/config4
-rwxr-xr-xtools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh361
-rw-r--r--tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh78
-rwxr-xr-xtools/testing/selftests/drivers/net/netcons_torture.sh130
-rw-r--r--tools/testing/selftests/drivers/net/netdevsim/Makefile4
-rw-r--r--tools/testing/selftests/filesystems/utils.c2
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc107
-rw-r--r--tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc18
-rw-r--r--tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc40
-rw-r--r--tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc4
-rw-r--r--tools/testing/selftests/iommu/iommufd.c2
-rw-r--r--tools/testing/selftests/iommu/iommufd_utils.h4
-rw-r--r--tools/testing/selftests/kselftest/runner.sh14
-rw-r--r--tools/testing/selftests/kvm/arm64/get-reg-list.c3
-rw-r--r--tools/testing/selftests/kvm/arm64/set_id_regs.c10
-rw-r--r--tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c9
-rw-r--r--tools/testing/selftests/livepatch/functions.sh6
-rw-r--r--tools/testing/selftests/mm/uffd-unit-tests.c15
-rw-r--r--tools/testing/selftests/namespaces/.gitignore9
-rw-r--r--tools/testing/selftests/namespaces/Makefile24
-rw-r--r--tools/testing/selftests/namespaces/cred_change_test.c814
-rw-r--r--tools/testing/selftests/namespaces/listns_efault_test.c530
-rw-r--r--tools/testing/selftests/namespaces/listns_pagination_bug.c138
-rw-r--r--tools/testing/selftests/namespaces/listns_permissions_test.c759
-rw-r--r--tools/testing/selftests/namespaces/listns_test.c679
-rw-r--r--tools/testing/selftests/namespaces/ns_active_ref_test.c2672
-rw-r--r--tools/testing/selftests/namespaces/nsid_test.c107
-rw-r--r--tools/testing/selftests/namespaces/regression_pidfd_setns_test.c113
-rw-r--r--tools/testing/selftests/namespaces/siocgskns_test.c1824
-rw-r--r--tools/testing/selftests/namespaces/stress_test.c626
-rw-r--r--tools/testing/selftests/namespaces/wrappers.h35
-rw-r--r--tools/testing/selftests/net/.gitignore1
-rw-r--r--tools/testing/selftests/net/af_unix/Makefile1
-rw-r--r--tools/testing/selftests/net/af_unix/so_peek_off.c162
-rwxr-xr-xtools/testing/selftests/net/bareudp.sh2
-rwxr-xr-xtools/testing/selftests/net/forwarding/lib_sh_test.sh7
-rwxr-xr-xtools/testing/selftests/net/forwarding/local_termination.sh2
-rw-r--r--tools/testing/selftests/net/gro.c12
-rw-r--r--tools/testing/selftests/net/lib.sh2
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_connect.c18
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_connect.sh2
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_join.sh99
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_lib.sh21
-rw-r--r--tools/testing/selftests/nolibc/Makefile.nolibc6
-rw-r--r--tools/testing/selftests/nolibc/nolibc-test.c13
-rwxr-xr-xtools/testing/selftests/nolibc/run-tests.sh8
-rw-r--r--tools/testing/selftests/pidfd/pidfd.h15
-rw-r--r--tools/testing/selftests/pidfd/pidfd_info_test.c73
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-again.sh56
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-series.sh116
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm.sh2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE041
-rw-r--r--tools/testing/selftests/rseq/rseq-s390.h39
-rwxr-xr-xtools/testing/selftests/run_kselftest.sh14
-rw-r--r--tools/testing/selftests/sched_ext/Makefile1
-rw-r--r--tools/testing/selftests/sched_ext/peek_dsq.bpf.c251
-rw-r--r--tools/testing/selftests/sched_ext/peek_dsq.c224
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json44
-rw-r--r--tools/testing/selftests/timers/nanosleep.c55
-rw-r--r--tools/testing/selftests/timers/posix_timers.c32
-rw-r--r--tools/testing/selftests/user_events/perf_test.c2
-rw-r--r--tools/testing/selftests/vDSO/vdso_config.h4
-rw-r--r--tools/testing/selftests/vfio/lib/include/vfio_util.h46
-rw-r--r--tools/testing/selftests/vfio/lib/vfio_pci_device.c350
-rw-r--r--tools/testing/selftests/vfio/vfio_dma_mapping_test.c111
-rw-r--r--tools/testing/selftests/vfio/vfio_pci_driver_test.c12
-rwxr-xr-xtools/testing/selftests/vsock/vmtest.sh8
-rw-r--r--tools/testing/selftests/x86/test_vsyscall.c21
-rw-r--r--tools/thermal/thermal-engine/thermal-engine.c2
-rw-r--r--tools/tracing/latency/latency-collector.c2
271 files changed, 30959 insertions, 3506 deletions
diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h
index 139d5e87dc95..b35d954d50c3 100644
--- a/tools/arch/arm64/include/asm/cputype.h
+++ b/tools/arch/arm64/include/asm/cputype.h
@@ -245,7 +245,7 @@
#define MIDR_FUJITSU_ERRATUM_010001_MASK (~MIDR_CPU_VAR_REV(1, 0))
#define TCR_CLEAR_FUJITSU_ERRATUM_010001 (TCR_NFD1 | TCR_NFD0)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/sysreg.h>
@@ -338,6 +338,6 @@ static inline u32 __attribute_const__ read_cpuid_cachetype(void)
{
return read_cpuid(CTR_EL0);
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/tools/arch/arm64/include/asm/esr.h b/tools/arch/arm64/include/asm/esr.h
index bd592ca81571..bbfbd1497a2f 100644
--- a/tools/arch/arm64/include/asm/esr.h
+++ b/tools/arch/arm64/include/asm/esr.h
@@ -385,7 +385,7 @@
#define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5)
#define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/types.h>
static inline unsigned long esr_brk_comment(unsigned long esr)
@@ -450,6 +450,6 @@ static inline bool esr_iss_is_eretab(unsigned long esr)
}
const char *esr_get_class_string(unsigned long esr);
-#endif /* __ASSEMBLY */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_ESR_H */
diff --git a/tools/arch/arm64/include/asm/gpr-num.h b/tools/arch/arm64/include/asm/gpr-num.h
index 05da4a7c5788..a114e4f8209b 100644
--- a/tools/arch/arm64/include/asm/gpr-num.h
+++ b/tools/arch/arm64/include/asm/gpr-num.h
@@ -2,7 +2,7 @@
#ifndef __ASM_GPR_NUM_H
#define __ASM_GPR_NUM_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
.equ .L__gpr_num_x\num, \num
@@ -11,7 +11,7 @@
.equ .L__gpr_num_xzr, 31
.equ .L__gpr_num_wzr, 31
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#define __DEFINE_ASM_GPR_NUMS \
" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" \
@@ -21,6 +21,6 @@
" .equ .L__gpr_num_xzr, 31\n" \
" .equ .L__gpr_num_wzr, 31\n"
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_GPR_NUM_H */
diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h
index 65f2759ea27a..178b7322bf04 100644
--- a/tools/arch/arm64/include/asm/sysreg.h
+++ b/tools/arch/arm64/include/asm/sysreg.h
@@ -51,7 +51,7 @@
#ifndef CONFIG_BROKEN_GAS_INST
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
// The space separator is omitted so that __emit_inst(x) can be parsed as
// either an assembler directive or an assembler macro argument.
#define __emit_inst(x) .inst(x)
@@ -70,11 +70,11 @@
(((x) >> 24) & 0x000000ff))
#endif /* CONFIG_CPU_BIG_ENDIAN */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define __emit_inst(x) .long __INSTR_BSWAP(x)
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#define __emit_inst(x) ".long " __stringify(__INSTR_BSWAP(x)) "\n\t"
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* CONFIG_BROKEN_GAS_INST */
@@ -1078,9 +1078,7 @@
#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \
GCS_CAP_VALID_TOKEN)
-#define ARM64_FEATURE_FIELD_BITS 4
-
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.macro mrs_s, rt, sreg
__emit_inst(0xd5200000|(\sreg)|(.L__gpr_num_\rt))
diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h
index ed5f3892674c..a792a599b9d6 100644
--- a/tools/arch/arm64/include/uapi/asm/kvm.h
+++ b/tools/arch/arm64/include/uapi/asm/kvm.h
@@ -31,7 +31,7 @@
#define KVM_SPSR_FIQ 4
#define KVM_NR_SPSR 5
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/psci.h>
#include <linux/types.h>
#include <asm/ptrace.h>
diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h
index 56d7367ee344..21d8cee04638 100644
--- a/tools/arch/riscv/include/asm/csr.h
+++ b/tools/arch/riscv/include/asm/csr.h
@@ -167,7 +167,8 @@
#define VSIP_TO_HVIP_SHIFT (IRQ_VS_SOFT - IRQ_S_SOFT)
#define VSIP_VALID_MASK ((_AC(1, UL) << IRQ_S_SOFT) | \
(_AC(1, UL) << IRQ_S_TIMER) | \
- (_AC(1, UL) << IRQ_S_EXT))
+ (_AC(1, UL) << IRQ_S_EXT) | \
+ (_AC(1, UL) << IRQ_PMU_OVF))
/* AIA CSR bits */
#define TOPI_IID_SHIFT 16
@@ -280,7 +281,7 @@
#define CSR_HPMCOUNTER30H 0xc9e
#define CSR_HPMCOUNTER31H 0xc9f
-#define CSR_SSCOUNTOVF 0xda0
+#define CSR_SCOUNTOVF 0xda0
#define CSR_SSTATUS 0x100
#define CSR_SIE 0x104
diff --git a/tools/arch/s390/include/uapi/asm/bitsperlong.h b/tools/arch/s390/include/uapi/asm/bitsperlong.h
index d2bb620119bf..a226a1686a53 100644
--- a/tools/arch/s390/include/uapi/asm/bitsperlong.h
+++ b/tools/arch/s390/include/uapi/asm/bitsperlong.h
@@ -2,11 +2,7 @@
#ifndef __ASM_S390_BITSPERLONG_H
#define __ASM_S390_BITSPERLONG_H
-#ifndef __s390x__
-#define __BITS_PER_LONG 32
-#else
#define __BITS_PER_LONG 64
-#endif
#include <asm-generic/bitsperlong.h>
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 06fc0479a23f..ccc01ad6ff7c 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -320,7 +320,7 @@
#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */
#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */
#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */
-#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */
+#define X86_FEATURE_LKGS (12*32+18) /* Like MOV_GS except MSR_KERNEL_GS_BASE = GS.base */
#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */
#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */
#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */
@@ -407,9 +407,12 @@
#define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */
#define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */
-/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
+/*
+ * Linux-defined word for use with scattered/synthetic bits.
+ */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */
#define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */
+
#define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
@@ -444,6 +447,7 @@
#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */
#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */
#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */
+#define X86_FEATURE_SNP_SECURE_TSC (19*32+ 8) /* SEV-SNP Secure TSC */
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* hardware-enforced cache coherency */
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */
@@ -495,6 +499,9 @@
#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */
#define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */
#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */
+#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */
+#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */
+#define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */
/*
* BUG word(s)
@@ -551,4 +558,5 @@
#define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */
#define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */
#define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */
+#define X86_BUG_VMSCAPE X86_BUG( 1*32+10) /* "vmscape" CPU is affected by VMSCAPE attacks from guests */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h
index c683d609934b..8f10f2943370 100644
--- a/tools/arch/x86/include/asm/insn.h
+++ b/tools/arch/x86/include/asm/insn.h
@@ -312,7 +312,6 @@ static inline int insn_offset_immediate(struct insn *insn)
/**
* for_each_insn_prefix() -- Iterate prefixes in the instruction
* @insn: Pointer to struct insn.
- * @idx: Index storage.
* @prefix: Prefix byte.
*
* Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix
@@ -321,8 +320,8 @@ static inline int insn_offset_immediate(struct insn *insn)
* Since prefixes.nbytes can be bigger than 4 if some prefixes
* are repeated, it cannot be used for looping over the prefixes.
*/
-#define for_each_insn_prefix(insn, idx, prefix) \
- for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
+#define for_each_insn_prefix(insn, prefix) \
+ for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
#define POP_SS_OPCODE 0x1f
#define MOV_SREG_OPCODE 0x8e
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index f627196eb796..9e1720d73244 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -315,9 +315,12 @@
#define PERF_CAP_PT_IDX 16
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
+
+#define PERF_CAP_LBR_FMT 0x3f
#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
#define PERF_CAP_ARCH_REG BIT_ULL(7)
#define PERF_CAP_PEBS_FORMAT 0xf00
+#define PERF_CAP_FW_WRITES BIT_ULL(13)
#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17)
#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
@@ -633,6 +636,11 @@
#define MSR_AMD_PPIN 0xc00102f1
#define MSR_AMD64_CPUID_FN_7 0xc0011002
#define MSR_AMD64_CPUID_FN_1 0xc0011004
+
+#define MSR_AMD64_CPUID_EXT_FEAT 0xc0011005
+#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT 54
+#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT BIT_ULL(MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT)
+
#define MSR_AMD64_LS_CFG 0xc0011020
#define MSR_AMD64_DC_CFG 0xc0011022
#define MSR_AMD64_TW_CFG 0xc0011023
@@ -701,8 +709,15 @@
#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT)
#define MSR_AMD64_SNP_SMT_PROT_BIT 17
#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT)
-#define MSR_AMD64_SNP_RESV_BIT 18
+#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18
+#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT)
+#define MSR_AMD64_SNP_RESV_BIT 19
#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT)
+#define MSR_AMD64_SAVIC_CONTROL 0xc0010138
+#define MSR_AMD64_SAVIC_EN_BIT 0
+#define MSR_AMD64_SAVIC_EN BIT_ULL(MSR_AMD64_SAVIC_EN_BIT)
+#define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1
+#define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT)
#define MSR_AMD64_RMP_BASE 0xc0010132
#define MSR_AMD64_RMP_END 0xc0010133
#define MSR_AMD64_RMP_CFG 0xc0010136
@@ -735,6 +750,7 @@
#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300
#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301
#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302
+#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET 0xc0000303
/* AMD Hardware Feedback Support MSRs */
#define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500
@@ -1225,6 +1241,8 @@
/* - AMD: */
#define MSR_IA32_MBA_BW_BASE 0xc0000200
#define MSR_IA32_SMBA_BW_BASE 0xc0000280
+#define MSR_IA32_L3_QOS_ABMC_CFG 0xc00003fd
+#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff
#define MSR_IA32_EVT_CFG_BASE 0xc0000400
/* AMD-V MSRs */
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 0f15d683817d..d420c9c066d4 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -35,6 +35,11 @@
#define MC_VECTOR 18
#define XM_VECTOR 19
#define VE_VECTOR 20
+#define CP_VECTOR 21
+
+#define HV_VECTOR 28
+#define VC_VECTOR 29
+#define SX_VECTOR 30
/* Select x86 specific features in <linux/kvm.h> */
#define __KVM_HAVE_PIT
@@ -411,6 +416,35 @@ struct kvm_xcrs {
__u64 padding[16];
};
+#define KVM_X86_REG_TYPE_MSR 2
+#define KVM_X86_REG_TYPE_KVM 3
+
+#define KVM_X86_KVM_REG_SIZE(reg) \
+({ \
+ reg == KVM_REG_GUEST_SSP ? KVM_REG_SIZE_U64 : 0; \
+})
+
+#define KVM_X86_REG_TYPE_SIZE(type, reg) \
+({ \
+ __u64 type_size = (__u64)type << 32; \
+ \
+ type_size |= type == KVM_X86_REG_TYPE_MSR ? KVM_REG_SIZE_U64 : \
+ type == KVM_X86_REG_TYPE_KVM ? KVM_X86_KVM_REG_SIZE(reg) : \
+ 0; \
+ type_size; \
+})
+
+#define KVM_X86_REG_ID(type, index) \
+ (KVM_REG_X86 | KVM_X86_REG_TYPE_SIZE(type, index) | index)
+
+#define KVM_X86_REG_MSR(index) \
+ KVM_X86_REG_ID(KVM_X86_REG_TYPE_MSR, index)
+#define KVM_X86_REG_KVM(index) \
+ KVM_X86_REG_ID(KVM_X86_REG_TYPE_KVM, index)
+
+/* KVM-defined registers starting from 0 */
+#define KVM_REG_GUEST_SSP 0
+
#define KVM_SYNC_X86_REGS (1UL << 0)
#define KVM_SYNC_X86_SREGS (1UL << 1)
#define KVM_SYNC_X86_EVENTS (1UL << 2)
diff --git a/tools/arch/x86/include/uapi/asm/svm.h b/tools/arch/x86/include/uapi/asm/svm.h
index 9c640a521a67..650e3256ea7d 100644
--- a/tools/arch/x86/include/uapi/asm/svm.h
+++ b/tools/arch/x86/include/uapi/asm/svm.h
@@ -118,6 +118,10 @@
#define SVM_VMGEXIT_AP_CREATE 1
#define SVM_VMGEXIT_AP_DESTROY 2
#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018
+#define SVM_VMGEXIT_SAVIC 0x8000001a
+#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0
+#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1
+#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL
#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe
#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \
diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h
index f0f4a4cf84a7..1baa86dfe029 100644
--- a/tools/arch/x86/include/uapi/asm/vmx.h
+++ b/tools/arch/x86/include/uapi/asm/vmx.h
@@ -93,7 +93,10 @@
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_SEAMCALL 76
#define EXIT_REASON_TDCALL 77
+#define EXIT_REASON_MSR_READ_IMM 84
+#define EXIT_REASON_MSR_WRITE_IMM 85
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -158,7 +161,9 @@
{ EXIT_REASON_TPAUSE, "TPAUSE" }, \
{ EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \
{ EXIT_REASON_NOTIFY, "NOTIFY" }, \
- { EXIT_REASON_TDCALL, "TDCALL" }
+ { EXIT_REASON_TDCALL, "TDCALL" }, \
+ { EXIT_REASON_MSR_READ_IMM, "MSR_READ_IMM" }, \
+ { EXIT_REASON_MSR_WRITE_IMM, "MSR_WRITE_IMM" }
#define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
diff --git a/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk b/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk
new file mode 100644
index 000000000000..cc4c7a3e6c2e
--- /dev/null
+++ b/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk
@@ -0,0 +1,34 @@
+#!/bin/awk -f
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2025, Oracle and/or its affiliates.
+#
+# Usage: awk -f gen-cpu-feature-names-x86.awk cpufeatures.h > cpu-feature-names.c
+#
+
+BEGIN {
+ print "/* cpu feature name array generated from cpufeatures.h */"
+ print "/* Do not change this code. */"
+ print
+ print "static const char *cpu_feature_names[(NCAPINTS+NBUGINTS)*32] = {"
+
+ value_expr = "\\([0-9*+ ]+\\)"
+}
+
+/^#define X86_FEATURE_/ {
+ if (match($0, value_expr)) {
+ value = substr($0, RSTART + 1, RLENGTH - 2)
+ print "\t[" value "] = \"" $2 "\","
+ }
+}
+
+/^#define X86_BUG_/ {
+ if (match($0, value_expr)) {
+ value = substr($0, RSTART + 1, RLENGTH - 2)
+ print "\t[NCAPINTS*32+(" value ")] = \"" $2 "\","
+ }
+}
+
+END {
+ print "};"
+}
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 009633294b09..35aeeaf5f711 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -182,7 +182,7 @@ bpftool prog tracelog
bpftool prog tracelog { stdout | stderr } *PROG*
Dump the BPF stream of the program. BPF programs can write to these streams
- at runtime with the **bpf_stream_vprintk**\ () kfunc. The kernel may write
+ at runtime with the **bpf_stream_vprintk_impl**\ () kfunc. The kernel may write
error messages to the standard error stream. This facility should be used
only for debugging purposes.
diff --git a/tools/build/Build b/tools/build/Build
new file mode 100644
index 000000000000..1c7e598e9f59
--- /dev/null
+++ b/tools/build/Build
@@ -0,0 +1,2 @@
+hostprogs := fixdep
+fixdep-y := fixdep.o
diff --git a/tools/build/Makefile b/tools/build/Makefile
index 63ef21878761..3a5a3808ab2a 100644
--- a/tools/build/Makefile
+++ b/tools/build/Makefile
@@ -37,5 +37,22 @@ ifneq ($(wildcard $(TMP_O)),)
$(Q)$(MAKE) -C feature OUTPUT=$(TMP_O) clean >/dev/null
endif
-$(OUTPUT)fixdep: $(srctree)/tools/build/fixdep.c
- $(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTCFLAGS) $(KBUILD_HOSTLDFLAGS) -o $@ $<
+FIXDEP := $(OUTPUT)fixdep
+FIXDEP_IN := $(OUTPUT)fixdep-in.o
+
+# To track fixdep's dependencies properly, fixdep needs to run on itself.
+# Build it twice the first time.
+$(FIXDEP_IN): FORCE
+ $(Q)if [ ! -f $(FIXDEP) ]; then \
+ $(MAKE) $(build)=fixdep HOSTCFLAGS="$(KBUILD_HOSTCFLAGS)"; \
+ rm -f $(FIXDEP).o; \
+ fi
+ $(Q)$(MAKE) $(build)=fixdep HOSTCFLAGS="$(KBUILD_HOSTCFLAGS)"
+
+
+$(FIXDEP): $(FIXDEP_IN)
+ $(QUIET_LINK)$(HOSTCC) $(FIXDEP_IN) $(KBUILD_HOSTLDFLAGS) -o $@
+
+FORCE:
+
+.PHONY: FORCE
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 32bbe29fe5f6..300a329bc581 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -315,5 +315,7 @@ endef
ifeq ($(FEATURE_DISPLAY_DEFERRED),)
$(call feature_display_entries)
- $(info )
+ ifeq ($(feature_display),1)
+ $(info )
+ endif
endif
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 49b0add392b1..95646290cb89 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -107,7 +107,7 @@ all: $(FILES)
__BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS)
BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1
BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl
- BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd
+ BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -ldl -lz -llzma -lzstd
__BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1
@@ -115,7 +115,7 @@ __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(
###############################
$(OUTPUT)test-all.bin:
- $(BUILD_ALL) || $(BUILD_ALL) -lopcodes -liberty
+ $(BUILD_ALL)
$(OUTPUT)test-hello.bin:
$(BUILD)
diff --git a/tools/docs/check-variable-fonts.py b/tools/docs/check-variable-fonts.py
new file mode 100755
index 000000000000..958d5a745724
--- /dev/null
+++ b/tools/docs/check-variable-fonts.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) Akira Yokosawa, 2024
+#
+# Ported to Python by (c) Mauro Carvalho Chehab, 2025
+#
+# pylint: disable=C0103
+
+"""
+Detect problematic Noto CJK variable fonts.
+
+or more details, see .../tools/lib/python/kdoc/latex_fonts.py.
+"""
+
+import argparse
+import sys
+import os.path
+
+src_dir = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(src_dir, '../lib/python'))
+
+from kdoc.latex_fonts import LatexFontChecker
+
+checker = LatexFontChecker()
+
+parser=argparse.ArgumentParser(description=checker.description(),
+ formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument("--deny-vf",
+ help="XDG_CONFIG_HOME dir containing fontconfig/fonts.conf file")
+
+args=parser.parse_args()
+
+msg = LatexFontChecker(args.deny_vf).check()
+if msg:
+ print(msg)
+
+sys.exit(1)
diff --git a/tools/docs/checktransupdate.py b/tools/docs/checktransupdate.py
new file mode 100755
index 000000000000..e894652369a5
--- /dev/null
+++ b/tools/docs/checktransupdate.py
@@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+This script helps track the translation status of the documentation
+in different locales, e.g., zh_CN. More specially, it uses `git log`
+commit to find the latest english commit from the translation commit
+(order by author date) and the latest english commits from HEAD. If
+differences occur, report the file and commits that need to be updated.
+
+The usage is as follows:
+- tools/docs/checktransupdate.py -l zh_CN
+This will print all the files that need to be updated or translated in the zh_CN locale.
+- tools/docs/checktransupdate.py Documentation/translations/zh_CN/dev-tools/testing-overview.rst
+This will only print the status of the specified file.
+
+The output is something like:
+Documentation/dev-tools/kfence.rst
+No translation in the locale of zh_CN
+
+Documentation/translations/zh_CN/dev-tools/testing-overview.rst
+commit 42fb9cfd5b18 ("Documentation: dev-tools: Add link to RV docs")
+1 commits needs resolving in total
+"""
+
+import os
+import re
+import time
+import logging
+from argparse import ArgumentParser, ArgumentTypeError, BooleanOptionalAction
+from datetime import datetime
+
+
+def get_origin_path(file_path):
+ """Get the origin path from the translation path"""
+ paths = file_path.split("/")
+ tidx = paths.index("translations")
+ opaths = paths[:tidx]
+ opaths += paths[tidx + 2 :]
+ return "/".join(opaths)
+
+
+def get_latest_commit_from(file_path, commit):
+ """Get the latest commit from the specified commit for the specified file"""
+ command = f"git log --pretty=format:%H%n%aD%n%cD%n%n%B {commit} -1 -- {file_path}"
+ logging.debug(command)
+ pipe = os.popen(command)
+ result = pipe.read()
+ result = result.split("\n")
+ if len(result) <= 1:
+ return None
+
+ logging.debug("Result: %s", result[0])
+
+ return {
+ "hash": result[0],
+ "author_date": datetime.strptime(result[1], "%a, %d %b %Y %H:%M:%S %z"),
+ "commit_date": datetime.strptime(result[2], "%a, %d %b %Y %H:%M:%S %z"),
+ "message": result[4:],
+ }
+
+
+def get_origin_from_trans(origin_path, t_from_head):
+ """Get the latest origin commit from the translation commit"""
+ o_from_t = get_latest_commit_from(origin_path, t_from_head["hash"])
+ while o_from_t is not None and o_from_t["author_date"] > t_from_head["author_date"]:
+ o_from_t = get_latest_commit_from(origin_path, o_from_t["hash"] + "^")
+ if o_from_t is not None:
+ logging.debug("tracked origin commit id: %s", o_from_t["hash"])
+ return o_from_t
+
+
+def get_origin_from_trans_smartly(origin_path, t_from_head):
+ """Get the latest origin commit from the formatted translation commit:
+ (1) update to commit HASH (TITLE)
+ (2) Update the translation through commit HASH (TITLE)
+ """
+ # catch flag for 12-bit commit hash
+ HASH = r'([0-9a-f]{12})'
+ # pattern 1: contains "update to commit HASH"
+ pat_update_to = re.compile(rf'update to commit {HASH}')
+ # pattern 2: contains "Update the translation through commit HASH"
+ pat_update_translation = re.compile(rf'Update the translation through commit {HASH}')
+
+ origin_commit_hash = None
+ for line in t_from_head["message"]:
+ # check if the line matches the first pattern
+ match = pat_update_to.search(line)
+ if match:
+ origin_commit_hash = match.group(1)
+ break
+ # check if the line matches the second pattern
+ match = pat_update_translation.search(line)
+ if match:
+ origin_commit_hash = match.group(1)
+ break
+ if origin_commit_hash is None:
+ return None
+ o_from_t = get_latest_commit_from(origin_path, origin_commit_hash)
+ if o_from_t is not None:
+ logging.debug("tracked origin commit id: %s", o_from_t["hash"])
+ return o_from_t
+
+
+def get_commits_count_between(opath, commit1, commit2):
+ """Get the commits count between two commits for the specified file"""
+ command = f"git log --pretty=format:%H {commit1}...{commit2} -- {opath}"
+ logging.debug(command)
+ pipe = os.popen(command)
+ result = pipe.read().split("\n")
+ # filter out empty lines
+ result = list(filter(lambda x: x != "", result))
+ return result
+
+
+def pretty_output(commit):
+ """Pretty print the commit message"""
+ command = f"git log --pretty='format:%h (\"%s\")' -1 {commit}"
+ logging.debug(command)
+ pipe = os.popen(command)
+ return pipe.read()
+
+
+def valid_commit(commit):
+ """Check if the commit is valid or not"""
+ msg = pretty_output(commit)
+ return "Merge tag" not in msg
+
+def check_per_file(file_path):
+ """Check the translation status for the specified file"""
+ opath = get_origin_path(file_path)
+
+ if not os.path.isfile(opath):
+ logging.error("Cannot find the origin path for {file_path}")
+ return
+
+ o_from_head = get_latest_commit_from(opath, "HEAD")
+ t_from_head = get_latest_commit_from(file_path, "HEAD")
+
+ if o_from_head is None or t_from_head is None:
+ logging.error("Cannot find the latest commit for %s", file_path)
+ return
+
+ o_from_t = get_origin_from_trans_smartly(opath, t_from_head)
+ # notice, o_from_t from get_*_smartly() is always more accurate than from get_*()
+ if o_from_t is None:
+ o_from_t = get_origin_from_trans(opath, t_from_head)
+
+ if o_from_t is None:
+ logging.error("Error: Cannot find the latest origin commit for %s", file_path)
+ return
+
+ if o_from_head["hash"] == o_from_t["hash"]:
+ logging.debug("No update needed for %s", file_path)
+ else:
+ logging.info(file_path)
+ commits = get_commits_count_between(
+ opath, o_from_t["hash"], o_from_head["hash"]
+ )
+ count = 0
+ for commit in commits:
+ if valid_commit(commit):
+ logging.info("commit %s", pretty_output(commit))
+ count += 1
+ logging.info("%d commits needs resolving in total\n", count)
+
+
+def valid_locales(locale):
+ """Check if the locale is valid or not"""
+ script_path = os.path.dirname(os.path.abspath(__file__))
+ linux_path = os.path.join(script_path, "../..")
+ if not os.path.isdir(f"{linux_path}/Documentation/translations/{locale}"):
+ raise ArgumentTypeError("Invalid locale: {locale}")
+ return locale
+
+
+def list_files_with_excluding_folders(folder, exclude_folders, include_suffix):
+ """List all files with the specified suffix in the folder and its subfolders"""
+ files = []
+ stack = [folder]
+
+ while stack:
+ pwd = stack.pop()
+ # filter out the exclude folders
+ if os.path.basename(pwd) in exclude_folders:
+ continue
+ # list all files and folders
+ for item in os.listdir(pwd):
+ ab_item = os.path.join(pwd, item)
+ if os.path.isdir(ab_item):
+ stack.append(ab_item)
+ else:
+ if ab_item.endswith(include_suffix):
+ files.append(ab_item)
+
+ return files
+
+
+class DmesgFormatter(logging.Formatter):
+ """Custom dmesg logging formatter"""
+ def format(self, record):
+ timestamp = time.time()
+ formatted_time = f"[{timestamp:>10.6f}]"
+ log_message = f"{formatted_time} {record.getMessage()}"
+ return log_message
+
+
+def config_logging(log_level, log_file="checktransupdate.log"):
+ """configure logging based on the log level"""
+ # set up the root logger
+ logger = logging.getLogger()
+ logger.setLevel(log_level)
+
+ # Create console handler
+ console_handler = logging.StreamHandler()
+ console_handler.setLevel(log_level)
+
+ # Create file handler
+ file_handler = logging.FileHandler(log_file)
+ file_handler.setLevel(log_level)
+
+ # Create formatter and add it to the handlers
+ formatter = DmesgFormatter()
+ console_handler.setFormatter(formatter)
+ file_handler.setFormatter(formatter)
+
+ # Add the handler to the logger
+ logger.addHandler(console_handler)
+ logger.addHandler(file_handler)
+
+
+def main():
+ """Main function of the script"""
+ script_path = os.path.dirname(os.path.abspath(__file__))
+ linux_path = os.path.join(script_path, "../..")
+
+ parser = ArgumentParser(description="Check the translation update")
+ parser.add_argument(
+ "-l",
+ "--locale",
+ default="zh_CN",
+ type=valid_locales,
+ help="Locale to check when files are not specified",
+ )
+
+ parser.add_argument(
+ "--print-missing-translations",
+ action=BooleanOptionalAction,
+ default=True,
+ help="Print files that do not have translations",
+ )
+
+ parser.add_argument(
+ '--log',
+ default='INFO',
+ choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+ help='Set the logging level')
+
+ parser.add_argument(
+ '--logfile',
+ default='checktransupdate.log',
+ help='Set the logging file (default: checktransupdate.log)')
+
+ parser.add_argument(
+ "files", nargs="*", help="Files to check, if not specified, check all files"
+ )
+ args = parser.parse_args()
+
+ # Configure logging based on the --log argument
+ log_level = getattr(logging, args.log.upper(), logging.INFO)
+ config_logging(log_level)
+
+ # Get files related to linux path
+ files = args.files
+ if len(files) == 0:
+ offical_files = list_files_with_excluding_folders(
+ os.path.join(linux_path, "Documentation"), ["translations", "output"], "rst"
+ )
+
+ for file in offical_files:
+ # split the path into parts
+ path_parts = file.split(os.sep)
+ # find the index of the "Documentation" directory
+ kindex = path_parts.index("Documentation")
+ # insert the translations and locale after the Documentation directory
+ new_path_parts = path_parts[:kindex + 1] + ["translations", args.locale] \
+ + path_parts[kindex + 1 :]
+ # join the path parts back together
+ new_file = os.sep.join(new_path_parts)
+ if os.path.isfile(new_file):
+ files.append(new_file)
+ else:
+ if args.print_missing_translations:
+ logging.info(os.path.relpath(os.path.abspath(file), linux_path))
+ logging.info("No translation in the locale of %s\n", args.locale)
+
+ files = list(map(lambda x: os.path.relpath(os.path.abspath(x), linux_path), files))
+
+ # cd to linux root directory
+ os.chdir(linux_path)
+
+ for file in files:
+ check_per_file(file)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/docs/documentation-file-ref-check b/tools/docs/documentation-file-ref-check
new file mode 100755
index 000000000000..0cad42f6943b
--- /dev/null
+++ b/tools/docs/documentation-file-ref-check
@@ -0,0 +1,245 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+#
+# Treewide grep for references to files under Documentation, and report
+# non-existing files in stderr.
+
+use warnings;
+use strict;
+use Getopt::Long qw(:config no_auto_abbrev);
+
+# NOTE: only add things here when the file was gone, but the text wants
+# to mention a past documentation file, for example, to give credits for
+# the original work.
+my %false_positives = (
+ "Documentation/scsi/scsi_mid_low_api.rst" => "Documentation/Configure.help",
+ "drivers/vhost/vhost.c" => "Documentation/virtual/lguest/lguest.c",
+);
+
+my $scriptname = $0;
+$scriptname =~ s,tools/docs/([^/]+/),$1,;
+
+# Parse arguments
+my $help = 0;
+my $fix = 0;
+my $warn = 0;
+
+if (! -e ".git") {
+ printf "Warning: can't check if file exists, as this is not a git tree\n";
+ exit 0;
+}
+
+GetOptions(
+ 'fix' => \$fix,
+ 'warn' => \$warn,
+ 'h|help|usage' => \$help,
+);
+
+if ($help != 0) {
+ print "$scriptname [--help] [--fix]\n";
+ exit -1;
+}
+
+# Step 1: find broken references
+print "Finding broken references. This may take a while... " if ($fix);
+
+my %broken_ref;
+
+my $doc_fix = 0;
+
+open IN, "git grep ':doc:\`' Documentation/|"
+ or die "Failed to run git grep";
+while (<IN>) {
+ next if (!m,^([^:]+):.*\:doc\:\`([^\`]+)\`,);
+ next if (m,sphinx/,);
+
+ my $file = $1;
+ my $d = $1;
+ my $doc_ref = $2;
+
+ my $f = $doc_ref;
+
+ $d =~ s,(.*/).*,$1,;
+ $f =~ s,.*\<([^\>]+)\>,$1,;
+
+ if ($f =~ m,^/,) {
+ $f = "$f.rst";
+ $f =~ s,^/,Documentation/,;
+ } else {
+ $f = "$d$f.rst";
+ }
+
+ next if (grep -e, glob("$f"));
+
+ if ($fix && !$doc_fix) {
+ print STDERR "\nWARNING: Currently, can't fix broken :doc:`` fields\n";
+ }
+ $doc_fix++;
+
+ print STDERR "$file: :doc:`$doc_ref`\n";
+}
+close IN;
+
+open IN, "git grep 'Documentation/'|"
+ or die "Failed to run git grep";
+while (<IN>) {
+ next if (!m/^([^:]+):(.*)/);
+
+ my $f = $1;
+ my $ln = $2;
+
+ # On linux-next, discard the Next/ directory
+ next if ($f =~ m,^Next/,);
+
+ # Makefiles and scripts contain nasty expressions to parse docs
+ next if ($f =~ m/Makefile/ || $f =~ m/\.(sh|py|pl|~|rej|org|orig)$/);
+
+ # It doesn't make sense to parse hidden files
+ next if ($f =~ m#/\.#);
+
+ # Skip this script
+ next if ($f eq $scriptname);
+
+ # Ignore the dir where documentation will be built
+ next if ($ln =~ m,\b(\S*)Documentation/output,);
+
+ if ($ln =~ m,\b(\S*)(Documentation/[A-Za-z0-9\_\.\,\~/\*\[\]\?+-]*)(.*),) {
+ my $prefix = $1;
+ my $ref = $2;
+ my $base = $2;
+ my $extra = $3;
+
+ # some file references are like:
+ # /usr/src/linux/Documentation/DMA-{API,mapping}.txt
+ # For now, ignore them
+ next if ($extra =~ m/^{/);
+
+ # Remove footnotes at the end like:
+ # Documentation/devicetree/dt-object-internal.txt[1]
+ $ref =~ s/(txt|rst)\[\d+]$/$1/;
+
+ # Remove ending ']' without any '['
+ $ref =~ s/\].*// if (!($ref =~ m/\[/));
+
+ # Remove puntuation marks at the end
+ $ref =~ s/[\,\.]+$//;
+
+ my $fulref = "$prefix$ref";
+
+ $fulref =~ s/^(\<file|ref)://;
+ $fulref =~ s/^[\'\`]+//;
+ $fulref =~ s,^\$\(.*\)/,,;
+ $base =~ s,.*/,,;
+
+ # Remove URL false-positives
+ next if ($fulref =~ m/^http/);
+
+ # Remove sched-pelt false-positive
+ next if ($fulref =~ m,^Documentation/scheduler/sched-pelt$,);
+
+ # Discard some build examples from Documentation/target/tcm_mod_builder.rst
+ next if ($fulref =~ m,mnt/sdb/lio-core-2.6.git/Documentation/target,);
+
+ # Check if exists, evaluating wildcards
+ next if (grep -e, glob("$ref $fulref"));
+
+ # Accept relative Documentation patches for tools/
+ if ($f =~ m/tools/) {
+ my $path = $f;
+ $path =~ s,(.*)/.*,$1,;
+ $path =~ s,testing/selftests/bpf,bpf/bpftool,;
+ next if (grep -e, glob("$path/$ref $path/../$ref $path/$fulref"));
+ }
+
+ # Discard known false-positives
+ if (defined($false_positives{$f})) {
+ next if ($false_positives{$f} eq $fulref);
+ }
+
+ if ($fix) {
+ if (!($ref =~ m/(scripts|Kconfig|Kbuild)/)) {
+ $broken_ref{$ref}++;
+ }
+ } elsif ($warn) {
+ print STDERR "Warning: $f references a file that doesn't exist: $fulref\n";
+ } else {
+ print STDERR "$f: $fulref\n";
+ }
+ }
+}
+close IN;
+
+exit 0 if (!$fix);
+
+# Step 2: Seek for file name alternatives
+print "Auto-fixing broken references. Please double-check the results\n";
+
+foreach my $ref (keys %broken_ref) {
+ my $new =$ref;
+
+ my $basedir = ".";
+ # On translations, only seek inside the translations directory
+ $basedir = $1 if ($ref =~ m,(Documentation/translations/[^/]+),);
+
+ # get just the basename
+ $new =~ s,.*/,,;
+
+ my $f="";
+
+ # usual reason for breakage: DT file moved around
+ if ($ref =~ /devicetree/) {
+ # usual reason for breakage: DT file renamed to .yaml
+ if (!$f) {
+ my $new_ref = $ref;
+ $new_ref =~ s/\.txt$/.yaml/;
+ $f=$new_ref if (-f $new_ref);
+ }
+
+ if (!$f) {
+ my $search = $new;
+ $search =~ s,^.*/,,;
+ $f = qx(find Documentation/devicetree/ -iname "*$search*") if ($search);
+ if (!$f) {
+ # Manufacturer name may have changed
+ $search =~ s/^.*,//;
+ $f = qx(find Documentation/devicetree/ -iname "*$search*") if ($search);
+ }
+ }
+ }
+
+ # usual reason for breakage: file renamed to .rst
+ if (!$f) {
+ $new =~ s/\.txt$/.rst/;
+ $f=qx(find $basedir -iname $new) if ($new);
+ }
+
+ # usual reason for breakage: use dash or underline
+ if (!$f) {
+ $new =~ s/[-_]/[-_]/g;
+ $f=qx(find $basedir -iname $new) if ($new);
+ }
+
+ # Wild guess: seek for the same name on another place
+ if (!$f) {
+ $f = qx(find $basedir -iname $new) if ($new);
+ }
+
+ my @find = split /\s+/, $f;
+
+ if (!$f) {
+ print STDERR "ERROR: Didn't find a replacement for $ref\n";
+ } elsif (scalar(@find) > 1) {
+ print STDERR "WARNING: Won't auto-replace, as found multiple files close to $ref:\n";
+ foreach my $j (@find) {
+ $j =~ s,^./,,;
+ print STDERR " $j\n";
+ }
+ } else {
+ $f = $find[0];
+ $f =~ s,^./,,;
+ print "INFO: Replacing $ref to $f\n";
+ foreach my $j (qx(git grep -l $ref)) {
+ qx(sed "s\@$ref\@$f\@g" -i $j);
+ }
+ }
+}
diff --git a/tools/docs/features-refresh.sh b/tools/docs/features-refresh.sh
new file mode 100755
index 000000000000..c2288124e94a
--- /dev/null
+++ b/tools/docs/features-refresh.sh
@@ -0,0 +1,98 @@
+#
+# Small script that refreshes the kernel feature support status in place.
+#
+
+for F_FILE in Documentation/features/*/*/arch-support.txt; do
+ F=$(grep "^# Kconfig:" "$F_FILE" | cut -c26-)
+
+ #
+ # Each feature F is identified by a pair (O, K), where 'O' can
+ # be either the empty string (for 'nop') or "not" (the logical
+ # negation operator '!'); other operators are not supported.
+ #
+ O=""
+ K=$F
+ if [[ "$F" == !* ]]; then
+ O="not"
+ K=$(echo $F | sed -e 's/^!//g')
+ fi
+
+ #
+ # F := (O, K) is 'valid' iff there is a Kconfig file (for some
+ # arch) which contains K.
+ #
+ # Notice that this definition entails an 'asymmetry' between
+ # the case 'O = ""' and the case 'O = "not"'. E.g., F may be
+ # _invalid_ if:
+ #
+ # [case 'O = ""']
+ # 1) no arch provides support for F,
+ # 2) K does not exist (e.g., it was renamed/mis-typed);
+ #
+ # [case 'O = "not"']
+ # 3) all archs provide support for F,
+ # 4) as in (2).
+ #
+ # The rationale for adopting this definition (and, thus, for
+ # keeping the asymmetry) is:
+ #
+ # We want to be able to 'detect' (2) (or (4)).
+ #
+ # (1) and (3) may further warn the developers about the fact
+ # that K can be removed.
+ #
+ F_VALID="false"
+ for ARCH_DIR in arch/*/; do
+ K_FILES=$(find $ARCH_DIR -name "Kconfig*")
+ K_GREP=$(grep "$K" $K_FILES)
+ if [ ! -z "$K_GREP" ]; then
+ F_VALID="true"
+ break
+ fi
+ done
+ if [ "$F_VALID" = "false" ]; then
+ printf "WARNING: '%s' is not a valid Kconfig\n" "$F"
+ fi
+
+ T_FILE="$F_FILE.tmp"
+ grep "^#" $F_FILE > $T_FILE
+ echo " -----------------------" >> $T_FILE
+ echo " | arch |status|" >> $T_FILE
+ echo " -----------------------" >> $T_FILE
+ for ARCH_DIR in arch/*/; do
+ ARCH=$(echo $ARCH_DIR | sed -e 's/^arch//g' | sed -e 's/\///g')
+ K_FILES=$(find $ARCH_DIR -name "Kconfig*")
+ K_GREP=$(grep "$K" $K_FILES)
+ #
+ # Arch support status values for (O, K) are updated according
+ # to the following rules.
+ #
+ # - ("", K) is 'supported by a given arch', if there is a
+ # Kconfig file for that arch which contains K;
+ #
+ # - ("not", K) is 'supported by a given arch', if there is
+ # no Kconfig file for that arch which contains K;
+ #
+ # - otherwise: preserve the previous status value (if any),
+ # default to 'not yet supported'.
+ #
+ # Notice that, according these rules, invalid features may be
+ # updated/modified.
+ #
+ if [ "$O" = "" ] && [ ! -z "$K_GREP" ]; then
+ printf " |%12s: | ok |\n" "$ARCH" >> $T_FILE
+ elif [ "$O" = "not" ] && [ -z "$K_GREP" ]; then
+ printf " |%12s: | ok |\n" "$ARCH" >> $T_FILE
+ else
+ S=$(grep -v "^#" "$F_FILE" | grep " $ARCH:")
+ if [ ! -z "$S" ]; then
+ echo "$S" >> $T_FILE
+ else
+ printf " |%12s: | TODO |\n" "$ARCH" \
+ >> $T_FILE
+ fi
+ fi
+ done
+ echo " -----------------------" >> $T_FILE
+ mv $T_FILE $F_FILE
+done
diff --git a/tools/docs/find-unused-docs.sh b/tools/docs/find-unused-docs.sh
new file mode 100755
index 000000000000..05552dbda5bc
--- /dev/null
+++ b/tools/docs/find-unused-docs.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# (c) 2017, Jonathan Corbet <corbet@lwn.net>
+# sayli karnik <karniksayli1995@gmail.com>
+#
+# This script detects files with kernel-doc comments for exported functions
+# that are not included in documentation.
+#
+# usage: Run 'tools/docs/find-unused-docs.sh directory' from top level of kernel
+# tree.
+#
+# example: $tools/docs/find-unused-docs.sh drivers/scsi
+#
+# Licensed under the terms of the GNU GPL License
+
+if ! [ -d "Documentation" ]; then
+ echo "Run from top level of kernel tree"
+ exit 1
+fi
+
+if [ "$#" -ne 1 ]; then
+ echo "Usage: tools/docs/find-unused-docs.sh directory"
+ exit 1
+fi
+
+if ! [ -d "$1" ]; then
+ echo "Directory $1 doesn't exist"
+ exit 1
+fi
+
+cd "$( dirname "${BASH_SOURCE[0]}" )"
+cd ..
+
+cd Documentation/
+
+echo "The following files contain kerneldoc comments for exported functions \
+that are not used in the formatted documentation"
+
+# FILES INCLUDED
+
+files_included=($(grep -rHR ".. kernel-doc" --include \*.rst | cut -d " " -f 3))
+
+declare -A FILES_INCLUDED
+
+for each in "${files_included[@]}"; do
+ FILES_INCLUDED[$each]="$each"
+ done
+
+cd ..
+
+# FILES NOT INCLUDED
+
+for file in `find $1 -name '*.c'`; do
+
+ if [[ ${FILES_INCLUDED[$file]+_} ]]; then
+ continue;
+ fi
+ str=$(PYTHONDONTWRITEBYTECODE=1 scripts/kernel-doc -export "$file" 2>/dev/null)
+ if [[ -n "$str" ]]; then
+ echo "$file"
+ fi
+ done
+
diff --git a/tools/docs/get_abi.py b/tools/docs/get_abi.py
new file mode 100755
index 000000000000..2f0b99401f26
--- /dev/null
+++ b/tools/docs/get_abi.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+# pylint: disable=R0903
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+Parse ABI documentation and produce results from it.
+"""
+
+import argparse
+import logging
+import os
+import sys
+
+# Import Python modules
+
+LIB_DIR = "../lib/python"
+SRC_DIR = os.path.dirname(os.path.realpath(__file__))
+
+sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR))
+
+from abi.abi_parser import AbiParser # pylint: disable=C0413
+from abi.abi_regex import AbiRegex # pylint: disable=C0413
+from abi.helpers import ABI_DIR, DEBUG_HELP # pylint: disable=C0413
+from abi.system_symbols import SystemSymbols # pylint: disable=C0413
+
+# Command line classes
+
+
+REST_DESC = """
+Produce output in ReST format.
+
+The output is done on two sections:
+
+- Symbols: show all parsed symbols in alphabetic order;
+- Files: cross reference the content of each file with the symbols on it.
+"""
+
+class AbiRest:
+ """Initialize an argparse subparser for rest output"""
+
+ def __init__(self, subparsers):
+ """Initialize argparse subparsers"""
+
+ parser = subparsers.add_parser("rest",
+ formatter_class=argparse.RawTextHelpFormatter,
+ description=REST_DESC)
+
+ parser.add_argument("--enable-lineno", action="store_true",
+ help="enable lineno")
+ parser.add_argument("--raw", action="store_true",
+ help="output text as contained in the ABI files. "
+ "It not used, output will contain dynamically"
+ " generated cross references when possible.")
+ parser.add_argument("--no-file", action="store_true",
+ help="Don't the files section")
+ parser.add_argument("--show-hints", help="Show-hints")
+
+ parser.set_defaults(func=self.run)
+
+ def run(self, args):
+ """Run subparser"""
+
+ parser = AbiParser(args.dir, debug=args.debug)
+ parser.parse_abi()
+ parser.check_issues()
+
+ for t in parser.doc(args.raw, not args.no_file):
+ if args.enable_lineno:
+ print (f".. LINENO {t[1]}#{t[2]}\n\n")
+
+ print(t[0])
+
+class AbiValidate:
+ """Initialize an argparse subparser for ABI validation"""
+
+ def __init__(self, subparsers):
+ """Initialize argparse subparsers"""
+
+ parser = subparsers.add_parser("validate",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description="list events")
+
+ parser.set_defaults(func=self.run)
+
+ def run(self, args):
+ """Run subparser"""
+
+ parser = AbiParser(args.dir, debug=args.debug)
+ parser.parse_abi()
+ parser.check_issues()
+
+
+class AbiSearch:
+ """Initialize an argparse subparser for ABI search"""
+
+ def __init__(self, subparsers):
+ """Initialize argparse subparsers"""
+
+ parser = subparsers.add_parser("search",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description="Search ABI using a regular expression")
+
+ parser.add_argument("expression",
+ help="Case-insensitive search pattern for the ABI symbol")
+
+ parser.set_defaults(func=self.run)
+
+ def run(self, args):
+ """Run subparser"""
+
+ parser = AbiParser(args.dir, debug=args.debug)
+ parser.parse_abi()
+ parser.search_symbols(args.expression)
+
+UNDEFINED_DESC="""
+Check undefined ABIs on local machine.
+
+Read sysfs devnodes and check if the devnodes there are defined inside
+ABI documentation.
+
+The search logic tries to minimize the number of regular expressions to
+search per each symbol.
+
+By default, it runs on a single CPU, as Python support for CPU threads
+is still experimental, and multi-process runs on Python is very slow.
+
+On experimental tests, if the number of ABI symbols to search per devnode
+is contained on a limit of ~150 regular expressions, using a single CPU
+is a lot faster than using multiple processes. However, if the number of
+regular expressions to check is at the order of ~30000, using multiple
+CPUs speeds up the check.
+"""
+
+class AbiUndefined:
+ """
+ Initialize an argparse subparser for logic to check undefined ABI at
+ the current machine's sysfs
+ """
+
+ def __init__(self, subparsers):
+ """Initialize argparse subparsers"""
+
+ parser = subparsers.add_parser("undefined",
+ formatter_class=argparse.RawTextHelpFormatter,
+ description=UNDEFINED_DESC)
+
+ parser.add_argument("-S", "--sysfs-dir", default="/sys",
+ help="directory where sysfs is mounted")
+ parser.add_argument("-s", "--search-string",
+ help="search string regular expression to limit symbol search")
+ parser.add_argument("-H", "--show-hints", action="store_true",
+ help="Hints about definitions for missing ABI symbols.")
+ parser.add_argument("-j", "--jobs", "--max-workers", type=int, default=1,
+ help="If bigger than one, enables multiprocessing.")
+ parser.add_argument("-c", "--max-chunk-size", type=int, default=50,
+ help="Maximum number of chunk size")
+ parser.add_argument("-f", "--found", action="store_true",
+ help="Also show found items. "
+ "Helpful to debug the parser."),
+ parser.add_argument("-d", "--dry-run", action="store_true",
+ help="Don't actually search for undefined. "
+ "Helpful to debug the parser."),
+
+ parser.set_defaults(func=self.run)
+
+ def run(self, args):
+ """Run subparser"""
+
+ abi = AbiRegex(args.dir, debug=args.debug,
+ search_string=args.search_string)
+
+ abi_symbols = SystemSymbols(abi=abi, hints=args.show_hints,
+ sysfs=args.sysfs_dir)
+
+ abi_symbols.check_undefined_symbols(dry_run=args.dry_run,
+ found=args.found,
+ max_workers=args.jobs,
+ chunk_size=args.max_chunk_size)
+
+
+def main():
+ """Main program"""
+
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+
+ parser.add_argument("-d", "--debug", type=int, default=0, help="debug level")
+ parser.add_argument("-D", "--dir", default=ABI_DIR, help=DEBUG_HELP)
+
+ subparsers = parser.add_subparsers()
+
+ AbiRest(subparsers)
+ AbiValidate(subparsers)
+ AbiSearch(subparsers)
+ AbiUndefined(subparsers)
+
+ args = parser.parse_args()
+
+ if args.debug:
+ level = logging.DEBUG
+ else:
+ level = logging.INFO
+
+ logging.basicConfig(level=level, format="[%(levelname)s] %(message)s")
+
+ if "func" in args:
+ args.func(args)
+ else:
+ sys.exit(f"Please specify a valid command for {sys.argv[0]}")
+
+
+# Call main method
+if __name__ == "__main__":
+ main()
diff --git a/tools/docs/get_feat.py b/tools/docs/get_feat.py
new file mode 100755
index 000000000000..2b5155a1f134
--- /dev/null
+++ b/tools/docs/get_feat.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+# pylint: disable=R0902,R0911,R0912,R0914,R0915
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+# SPDX-License-Identifier: GPL-2.0
+
+
+"""
+Parse the Linux Feature files and produce a ReST book.
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+
+from pprint import pprint
+
+LIB_DIR = "../../tools/lib/python"
+SRC_DIR = os.path.dirname(os.path.realpath(__file__))
+
+sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR))
+
+from feat.parse_features import ParseFeature # pylint: disable=C0413
+
+SRCTREE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..")
+DEFAULT_DIR = "Documentation/features"
+
+
+class GetFeature:
+ """Helper class to parse feature parsing parameters"""
+
+ @staticmethod
+ def get_current_arch():
+ """Detects the current architecture"""
+
+ proc = subprocess.run(["uname", "-m"], check=True,
+ capture_output=True, text=True)
+
+ arch = proc.stdout.strip()
+ if arch in ["x86_64", "i386"]:
+ arch = "x86"
+ elif arch == "s390x":
+ arch = "s390"
+
+ return arch
+
+ def run_parser(self, args):
+ """Execute the feature parser"""
+
+ feat = ParseFeature(args.directory, args.debug, args.enable_fname)
+ data = feat.parse()
+
+ if args.debug > 2:
+ pprint(data)
+
+ return feat
+
+ def run_rest(self, args):
+ """
+ Generate tables in ReST format. Three types of tables are
+ supported, depending on the calling arguments:
+
+ - neither feature nor arch is passed: generates a full matrix;
+ - arch provided: generates a table of supported tables for the
+ guiven architecture, eventually filtered by feature;
+ - only feature provided: generates a table with feature details,
+ showing what architectures it is implemented.
+ """
+
+ feat = self.run_parser(args)
+
+ if args.arch:
+ rst = feat.output_arch_table(args.arch, args.feat)
+ elif args.feat:
+ rst = feat.output_feature(args.feat)
+ else:
+ rst = feat.output_matrix()
+
+ print(rst)
+
+ def run_current(self, args):
+ """
+ Instead of using a --arch parameter, get feature for the current
+ architecture.
+ """
+
+ args.arch = self.get_current_arch()
+
+ self.run_rest(args)
+
+ def run_list(self, args):
+ """
+ Generate a list of features for a given architecture, in a format
+ parseable by other scripts. The output format is not ReST.
+ """
+
+ if not args.arch:
+ args.arch = self.get_current_arch()
+
+ feat = self.run_parser(args)
+ msg = feat.list_arch_features(args.arch, args.feat)
+
+ print(msg)
+
+ def parse_arch(self, parser):
+ """Add a --arch parsing argument"""
+
+ parser.add_argument("--arch",
+ help="Output features for an specific"
+ " architecture, optionally filtering for a "
+ "single specific feature.")
+
+ def parse_feat(self, parser):
+ """Add a --feat parsing argument"""
+
+ parser.add_argument("--feat", "--feature",
+ help="Output features for a single specific "
+ "feature.")
+
+
+ def current_args(self, subparsers):
+ """Implementscurrent argparse subparser"""
+
+ parser = subparsers.add_parser("current",
+ formatter_class=argparse.RawTextHelpFormatter,
+ description="Output table in ReST "
+ "compatible ASCII format "
+ "with features for this "
+ "machine's architecture")
+
+ self.parse_feat(parser)
+ parser.set_defaults(func=self.run_current)
+
+ def rest_args(self, subparsers):
+ """Implement rest argparse subparser"""
+
+ parser = subparsers.add_parser("rest",
+ formatter_class=argparse.RawTextHelpFormatter,
+ description="Output table(s) in ReST "
+ "compatible ASCII format "
+ "with features in ReST "
+ "markup language. The "
+ "output is affected by "
+ "--arch or --feat/--feature"
+ " flags.")
+
+ self.parse_arch(parser)
+ self.parse_feat(parser)
+ parser.set_defaults(func=self.run_rest)
+
+ def list_args(self, subparsers):
+ """Implement list argparse subparser"""
+
+ parser = subparsers.add_parser("list",
+ formatter_class=argparse.RawTextHelpFormatter,
+ description="List features for this "
+ "machine's architecture, "
+ "using an easier to parse "
+ "format. The output is "
+ "affected by --arch flag.")
+
+ self.parse_arch(parser)
+ self.parse_feat(parser)
+ parser.set_defaults(func=self.run_list)
+
+ def validate_args(self, subparsers):
+ """Implement validate argparse subparser"""
+
+ parser = subparsers.add_parser("validate",
+ formatter_class=argparse.RawTextHelpFormatter,
+ description="Validate the contents of "
+ "the files under "
+ f"{DEFAULT_DIR}.")
+
+ parser.set_defaults(func=self.run_parser)
+
+ def parser(self):
+ """
+ Create an arparse with common options and several subparsers
+ """
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+
+ parser.add_argument("-d", "--debug", action="count", default=0,
+ help="Put the script in verbose mode, useful for "
+ "debugging. Can be called multiple times, to "
+ "increase verbosity.")
+
+ parser.add_argument("--directory", "--dir", default=DEFAULT_DIR,
+ help="Changes the location of the Feature files. "
+ f"By default, it uses the {DEFAULT_DIR} "
+ "directory.")
+
+ parser.add_argument("--enable-fname", action="store_true",
+ help="Prints the file name of the feature files. "
+ "This can be used in order to track "
+ "dependencies during documentation build.")
+
+ subparsers = parser.add_subparsers()
+
+ self.current_args(subparsers)
+ self.rest_args(subparsers)
+ self.list_args(subparsers)
+ self.validate_args(subparsers)
+
+ args = parser.parse_args()
+
+ return args
+
+
+def main():
+ """Main program"""
+
+ feat = GetFeature()
+
+ args = feat.parser()
+
+ if "func" in args:
+ args.func(args)
+ else:
+ sys.exit(f"Please specify a valid command for {sys.argv[0]}")
+
+
+# Call main method
+if __name__ == "__main__":
+ main()
diff --git a/tools/docs/list-arch.sh b/tools/docs/list-arch.sh
new file mode 100755
index 000000000000..96fe83b7058b
--- /dev/null
+++ b/tools/docs/list-arch.sh
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Small script that visualizes the kernel feature support status
+# of an architecture.
+#
+# (If no arguments are given then it will print the host architecture's status.)
+#
+
+ARCH=${1:-$(uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/' | sed 's/s390x/s390/')}
+
+$(dirname $0)/get_feat.pl list --arch $ARCH
diff --git a/tools/docs/parse-headers.py b/tools/docs/parse-headers.py
index bfa4e46a53e3..436acea4c6ca 100755
--- a/tools/docs/parse-headers.py
+++ b/tools/docs/parse-headers.py
@@ -24,10 +24,13 @@ The optional ``FILE_RULES`` contains a set of rules like:
replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
"""
-import argparse
+import argparse, sys
+import os.path
-from lib.parse_data_structs import ParseDataStructs
-from lib.enrich_formatter import EnrichFormatter
+src_dir = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(src_dir, '../lib/python'))
+from kdoc.parse_data_structs import ParseDataStructs
+from kdoc.enrich_formatter import EnrichFormatter
def main():
"""Main function"""
@@ -47,10 +50,7 @@ def main():
args = parser.parse_args()
parser = ParseDataStructs(debug=args.debug)
- parser.parse_file(args.file_in)
-
- if args.file_rules:
- parser.process_exceptions(args.file_rules)
+ parser.parse_file(args.file_in, args.file_rules)
parser.debug_print()
parser.write_output(args.file_in, args.file_out, args.toc)
diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper
new file mode 100755
index 000000000000..7a5fcef25429
--- /dev/null
+++ b/tools/docs/sphinx-build-wrapper
@@ -0,0 +1,864 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+#
+# pylint: disable=R0902, R0912, R0913, R0914, R0915, R0917, C0103
+#
+# Converted from docs Makefile and parallel-wrapper.sh, both under
+# GPLv2, copyrighted since 2008 by the following authors:
+#
+# Akira Yokosawa <akiyks@gmail.com>
+# Arnd Bergmann <arnd@arndb.de>
+# Breno Leitao <leitao@debian.org>
+# Carlos Bilbao <carlos.bilbao@amd.com>
+# Dave Young <dyoung@redhat.com>
+# Donald Hunter <donald.hunter@gmail.com>
+# Geert Uytterhoeven <geert+renesas@glider.be>
+# Jani Nikula <jani.nikula@intel.com>
+# Jan Stancek <jstancek@redhat.com>
+# Jonathan Corbet <corbet@lwn.net>
+# Joshua Clayton <stillcompiling@gmail.com>
+# Kees Cook <keescook@chromium.org>
+# Linus Torvalds <torvalds@linux-foundation.org>
+# Magnus Damm <damm+renesas@opensource.se>
+# Masahiro Yamada <masahiroy@kernel.org>
+# Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+# Maxim Cournoyer <maxim.cournoyer@gmail.com>
+# Peter Foley <pefoley2@pefoley.com>
+# Randy Dunlap <rdunlap@infradead.org>
+# Rob Herring <robh@kernel.org>
+# Shuah Khan <shuahkh@osg.samsung.com>
+# Thorsten Blum <thorsten.blum@toblux.com>
+# Tomas Winkler <tomas.winkler@intel.com>
+
+
+"""
+Sphinx build wrapper that handles Kernel-specific business rules:
+
+- it gets the Kernel build environment vars;
+- it determines what's the best parallelism;
+- it handles SPHINXDIRS
+
+This tool ensures that MIN_PYTHON_VERSION is satisfied. If version is
+below that, it seeks for a new Python version. If found, it re-runs using
+the newer version.
+"""
+
+import argparse
+import locale
+import os
+import re
+import shlex
+import shutil
+import subprocess
+import sys
+
+from concurrent import futures
+from glob import glob
+
+
+LIB_DIR = "../lib/python"
+SRC_DIR = os.path.dirname(os.path.realpath(__file__))
+
+sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR))
+
+from kdoc.python_version import PythonVersion
+from kdoc.latex_fonts import LatexFontChecker
+from jobserver import JobserverExec # pylint: disable=C0413,C0411,E0401
+
+#
+# Some constants
+#
+VENV_DEFAULT = "sphinx_latest"
+MIN_PYTHON_VERSION = PythonVersion("3.7").version
+PAPER = ["", "a4", "letter"]
+
+TARGETS = {
+ "cleandocs": { "builder": "clean" },
+ "linkcheckdocs": { "builder": "linkcheck" },
+ "htmldocs": { "builder": "html" },
+ "epubdocs": { "builder": "epub", "out_dir": "epub" },
+ "texinfodocs": { "builder": "texinfo", "out_dir": "texinfo" },
+ "infodocs": { "builder": "texinfo", "out_dir": "texinfo" },
+ "mandocs": { "builder": "man", "out_dir": "man" },
+ "latexdocs": { "builder": "latex", "out_dir": "latex" },
+ "pdfdocs": { "builder": "latex", "out_dir": "latex" },
+ "xmldocs": { "builder": "xml", "out_dir": "xml" },
+}
+
+
+#
+# SphinxBuilder class
+#
+
+class SphinxBuilder:
+ """
+ Handles a sphinx-build target, adding needed arguments to build
+ with the Kernel.
+ """
+
+ def get_path(self, path, use_cwd=False, abs_path=False):
+ """
+ Ancillary routine to handle patches the right way, as shell does.
+
+ It first expands "~" and "~user". Then, if patch is not absolute,
+ join self.srctree. Finally, if requested, convert to abspath.
+ """
+
+ path = os.path.expanduser(path)
+ if not path.startswith("/"):
+ if use_cwd:
+ base = os.getcwd()
+ else:
+ base = self.srctree
+
+ path = os.path.join(base, path)
+
+ if abs_path:
+ return os.path.abspath(path)
+
+ return path
+
+ def check_rust(self):
+ """
+ Checks if Rust is enabled
+ """
+ self.rustdoc = False
+
+ config = os.path.join(self.srctree, ".config")
+
+ if not os.path.isfile(config):
+ return
+
+ re_rust = re.compile(r"CONFIG_RUST=(m|y)")
+
+ try:
+ with open(config, "r", encoding="utf-8") as fp:
+ for line in fp:
+ if re_rust.match(line):
+ self.rustdoc = True
+ return
+
+ except OSError as e:
+ print(f"Failed to open {config}", file=sys.stderr)
+
+ def get_sphinx_extra_opts(self, n_jobs):
+ """
+ Get the number of jobs to be used for docs build passed via command
+ line and desired sphinx verbosity.
+
+ The number of jobs can be on different places:
+
+ 1) It can be passed via "-j" argument;
+ 2) The SPHINXOPTS="-j8" env var may have "-j";
+ 3) if called via GNU make, -j specifies the desired number of jobs.
+ with GNU makefile, this number is available via POSIX jobserver;
+ 4) if none of the above is available, it should default to "-jauto",
+ and let sphinx decide the best value.
+ """
+
+ #
+ # SPHINXOPTS env var, if used, contains extra arguments to be used
+ # by sphinx-build time. Among them, it may contain sphinx verbosity
+ # and desired number of parallel jobs.
+ #
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-j', '--jobs', type=int)
+ parser.add_argument('-q', '--quiet', action='store_true')
+
+ #
+ # Other sphinx-build arguments go as-is, so place them
+ # at self.sphinxopts, using shell parser
+ #
+ sphinxopts = shlex.split(os.environ.get("SPHINXOPTS", ""))
+
+ #
+ # Build a list of sphinx args, honoring verbosity here if specified
+ #
+
+ verbose = self.verbose
+ sphinx_args, self.sphinxopts = parser.parse_known_args(sphinxopts)
+ if sphinx_args.quiet is True:
+ verbose = False
+
+ #
+ # If the user explicitly sets "-j" at command line, use it.
+ # Otherwise, pick it from SPHINXOPTS args
+ #
+ if n_jobs:
+ self.n_jobs = n_jobs
+ elif sphinx_args.jobs:
+ self.n_jobs = sphinx_args.jobs
+ else:
+ self.n_jobs = None
+
+ if not verbose:
+ self.sphinxopts += ["-q"]
+
+ def __init__(self, builddir, venv=None, verbose=False, n_jobs=None,
+ interactive=None):
+ """Initialize internal variables"""
+ self.venv = venv
+ self.verbose = None
+
+ #
+ # Normal variables passed from Kernel's makefile
+ #
+ self.kernelversion = os.environ.get("KERNELVERSION", "unknown")
+ self.kernelrelease = os.environ.get("KERNELRELEASE", "unknown")
+ self.pdflatex = os.environ.get("PDFLATEX", "xelatex")
+
+ #
+ # Kernel main Makefile defines a PYTHON3 variable whose default is
+ # "python3". When set to a different value, it allows running a
+ # diferent version than the default official python3 package.
+ # Several distros package python3xx-sphinx packages with newer
+ # versions of Python and sphinx-build.
+ #
+ # Honor such variable different than default
+ #
+ self.python = os.environ.get("PYTHON3")
+ if self.python == "python3":
+ self.python = None
+
+ if not interactive:
+ self.latexopts = os.environ.get("LATEXOPTS", "-interaction=batchmode -no-shell-escape")
+ else:
+ self.latexopts = os.environ.get("LATEXOPTS", "")
+
+ if not verbose:
+ verbose = bool(os.environ.get("KBUILD_VERBOSE", "") != "")
+
+ if verbose is not None:
+ self.verbose = verbose
+
+ #
+ # Source tree directory. This needs to be at os.environ, as
+ # Sphinx extensions use it
+ #
+ self.srctree = os.environ.get("srctree")
+ if not self.srctree:
+ self.srctree = "."
+ os.environ["srctree"] = self.srctree
+
+ #
+ # Now that we can expand srctree, get other directories as well
+ #
+ self.sphinxbuild = os.environ.get("SPHINXBUILD", "sphinx-build")
+ self.kerneldoc = self.get_path(os.environ.get("KERNELDOC",
+ "scripts/kernel-doc.py"))
+ self.builddir = self.get_path(builddir, use_cwd=True, abs_path=True)
+
+ #
+ # Get directory locations for LaTeX build toolchain
+ #
+ self.pdflatex_cmd = shutil.which(self.pdflatex)
+ self.latexmk_cmd = shutil.which("latexmk")
+
+ self.env = os.environ.copy()
+
+ self.get_sphinx_extra_opts(n_jobs)
+
+ self.check_rust()
+
+ #
+ # If venv command line argument is specified, run Sphinx from venv
+ #
+ if venv:
+ bin_dir = os.path.join(venv, "bin")
+ if not os.path.isfile(os.path.join(bin_dir, "activate")):
+ sys.exit(f"Venv {venv} not found.")
+
+ # "activate" virtual env
+ self.env["PATH"] = bin_dir + ":" + self.env["PATH"]
+ self.env["VIRTUAL_ENV"] = venv
+ if "PYTHONHOME" in self.env:
+ del self.env["PYTHONHOME"]
+ print(f"Setting venv to {venv}")
+
+ def run_sphinx(self, sphinx_build, build_args, *args, **pwargs):
+ """
+ Executes sphinx-build using current python3 command.
+
+ When calling via GNU make, POSIX jobserver is used to tell how
+ many jobs are still available from a job pool. claim all remaining
+ jobs, as we don't want sphinx-build to run in parallel with other
+ jobs.
+
+ Despite that, the user may actually force a different value than
+ the number of available jobs via command line.
+
+ The "with" logic here is used to ensure that the claimed jobs will
+ be freed once subprocess finishes
+ """
+
+ with JobserverExec() as jobserver:
+ if jobserver.claim:
+ #
+ # when GNU make is used, claim available jobs from jobserver
+ #
+ n_jobs = str(jobserver.claim)
+ else:
+ #
+ # Otherwise, let sphinx decide by default
+ #
+ n_jobs = "auto"
+
+ #
+ # If explicitly requested via command line, override default
+ #
+ if self.n_jobs:
+ n_jobs = str(self.n_jobs)
+
+ #
+ # We can't simply call python3 sphinx-build, as OpenSUSE
+ # Tumbleweed uses an ELF binary file (/usr/bin/alts) to switch
+ # between different versions of sphinx-build. So, only call it
+ # prepending "python3.xx" when PYTHON3 variable is not default.
+ #
+ if self.python:
+ cmd = [self.python]
+ else:
+ cmd = []
+
+ cmd += [sphinx_build]
+ cmd += [f"-j{n_jobs}"]
+ cmd += build_args
+ cmd += self.sphinxopts
+
+ if self.verbose:
+ print(" ".join(cmd))
+
+ return subprocess.call(cmd, *args, **pwargs)
+
+ def handle_html(self, css, output_dir):
+ """
+ Extra steps for HTML and epub output.
+
+ For such targets, we need to ensure that CSS will be properly
+ copied to the output _static directory
+ """
+
+ if css:
+ css = os.path.expanduser(css)
+ if not css.startswith("/"):
+ css = os.path.join(self.srctree, css)
+
+ static_dir = os.path.join(output_dir, "_static")
+ os.makedirs(static_dir, exist_ok=True)
+
+ try:
+ shutil.copy2(css, static_dir)
+ except (OSError, IOError) as e:
+ print(f"Warning: Failed to copy CSS: {e}", file=sys.stderr)
+
+ if self.rustdoc:
+ print("Building rust docs")
+ if "MAKE" in self.env:
+ cmd = [self.env["MAKE"]]
+ else:
+ cmd = ["make", "LLVM=1"]
+
+ cmd += [ "rustdoc"]
+ if self.verbose:
+ print(" ".join(cmd))
+
+ try:
+ subprocess.run(cmd, check=True)
+ except subprocess.CalledProcessError as e:
+ print(f"Ignored errors when building rustdoc: {e}. Is RUST enabled?",
+ file=sys.stderr)
+
+ def build_pdf_file(self, latex_cmd, from_dir, path):
+ """Builds a single pdf file using latex_cmd"""
+ try:
+ subprocess.run(latex_cmd + [path],
+ cwd=from_dir, check=True, env=self.env)
+
+ return True
+ except subprocess.CalledProcessError:
+ return False
+
+ def pdf_parallel_build(self, tex_suffix, latex_cmd, tex_files, n_jobs):
+ """Build PDF files in parallel if possible"""
+ builds = {}
+ build_failed = False
+ max_len = 0
+ has_tex = False
+
+ #
+ # LaTeX PDF error code is almost useless for us:
+ # any warning makes it non-zero. For kernel doc builds it always return
+ # non-zero even when build succeeds. So, let's do the best next thing:
+ # Ignore build errors. At the end, check if all PDF files were built,
+ # printing a summary with the built ones and returning 0 if all of
+ # them were actually built.
+ #
+ with futures.ThreadPoolExecutor(max_workers=n_jobs) as executor:
+ jobs = {}
+
+ for from_dir, pdf_dir, entry in tex_files:
+ name = entry.name
+
+ if not name.endswith(tex_suffix):
+ continue
+
+ name = name[:-len(tex_suffix)]
+ has_tex = True
+
+ future = executor.submit(self.build_pdf_file, latex_cmd,
+ from_dir, entry.path)
+ jobs[future] = (from_dir, pdf_dir, name)
+
+ for future in futures.as_completed(jobs):
+ from_dir, pdf_dir, name = jobs[future]
+
+ pdf_name = name + ".pdf"
+ pdf_from = os.path.join(from_dir, pdf_name)
+ pdf_to = os.path.join(pdf_dir, pdf_name)
+ out_name = os.path.relpath(pdf_to, self.builddir)
+ max_len = max(max_len, len(out_name))
+
+ try:
+ success = future.result()
+
+ if success and os.path.exists(pdf_from):
+ os.rename(pdf_from, pdf_to)
+
+ #
+ # if verbose, get the name of built PDF file
+ #
+ if self.verbose:
+ builds[out_name] = "SUCCESS"
+ else:
+ builds[out_name] = "FAILED"
+ build_failed = True
+ except futures.Error as e:
+ builds[out_name] = f"FAILED ({repr(e)})"
+ build_failed = True
+
+ #
+ # Handle case where no .tex files were found
+ #
+ if not has_tex:
+ out_name = "LaTeX files"
+ max_len = max(max_len, len(out_name))
+ builds[out_name] = "FAILED: no .tex files were generated"
+ build_failed = True
+
+ return builds, build_failed, max_len
+
+ def handle_pdf(self, output_dirs, deny_vf):
+ """
+ Extra steps for PDF output.
+
+ As PDF is handled via a LaTeX output, after building the .tex file,
+ a new build is needed to create the PDF output from the latex
+ directory.
+ """
+ builds = {}
+ max_len = 0
+ tex_suffix = ".tex"
+ tex_files = []
+
+ #
+ # Since early 2024, Fedora and openSUSE tumbleweed have started
+ # deploying variable-font format of "Noto CJK", causing LaTeX
+ # to break with CJK. Work around it, by denying the variable font
+ # usage during xelatex build by passing the location of a config
+ # file with a deny list.
+ #
+ # See tools/docs/lib/latex_fonts.py for more details.
+ #
+ if deny_vf:
+ deny_vf = os.path.expanduser(deny_vf)
+ if os.path.isdir(deny_vf):
+ self.env["XDG_CONFIG_HOME"] = deny_vf
+
+ for from_dir in output_dirs:
+ pdf_dir = os.path.join(from_dir, "../pdf")
+ os.makedirs(pdf_dir, exist_ok=True)
+
+ if self.latexmk_cmd:
+ latex_cmd = [self.latexmk_cmd, f"-{self.pdflatex}"]
+ else:
+ latex_cmd = [self.pdflatex]
+
+ latex_cmd.extend(shlex.split(self.latexopts))
+
+ # Get a list of tex files to process
+ with os.scandir(from_dir) as it:
+ for entry in it:
+ if entry.name.endswith(tex_suffix):
+ tex_files.append((from_dir, pdf_dir, entry))
+
+ #
+ # When using make, this won't be used, as the number of jobs comes
+ # from POSIX jobserver. So, this covers the case where build comes
+ # from command line. On such case, serialize by default, except if
+ # the user explicitly sets the number of jobs.
+ #
+ n_jobs = 1
+
+ # n_jobs is either an integer or "auto". Only use it if it is a number
+ if self.n_jobs:
+ try:
+ n_jobs = int(self.n_jobs)
+ except ValueError:
+ pass
+
+ #
+ # When using make, jobserver.claim is the number of jobs that were
+ # used with "-j" and that aren't used by other make targets
+ #
+ with JobserverExec() as jobserver:
+ n_jobs = 1
+
+ #
+ # Handle the case when a parameter is passed via command line,
+ # using it as default, if jobserver doesn't claim anything
+ #
+ if self.n_jobs:
+ try:
+ n_jobs = int(self.n_jobs)
+ except ValueError:
+ pass
+
+ if jobserver.claim:
+ n_jobs = jobserver.claim
+
+ builds, build_failed, max_len = self.pdf_parallel_build(tex_suffix,
+ latex_cmd,
+ tex_files,
+ n_jobs)
+
+ #
+ # In verbose mode, print a summary with the build results per file.
+ # Otherwise, print a single line with all failures, if any.
+ # On both cases, return code 1 indicates build failures,
+ #
+ if self.verbose:
+ msg = "Summary"
+ msg += "\n" + "=" * len(msg)
+ print()
+ print(msg)
+
+ for pdf_name, pdf_file in builds.items():
+ print(f"{pdf_name:<{max_len}}: {pdf_file}")
+
+ print()
+ if build_failed:
+ msg = LatexFontChecker().check()
+ if msg:
+ print(msg)
+
+ sys.exit("Error: not all PDF files were created.")
+
+ elif build_failed:
+ n_failures = len(builds)
+ failures = ", ".join(builds.keys())
+
+ msg = LatexFontChecker().check()
+ if msg:
+ print(msg)
+
+ sys.exit(f"Error: Can't build {n_failures} PDF file(s): {failures}")
+
+ def handle_info(self, output_dirs):
+ """
+ Extra steps for Info output.
+
+ For texinfo generation, an additional make is needed from the
+ texinfo directory.
+ """
+
+ for output_dir in output_dirs:
+ try:
+ subprocess.run(["make", "info"], cwd=output_dir, check=True)
+ except subprocess.CalledProcessError as e:
+ sys.exit(f"Error generating info docs: {e}")
+
+ def handle_man(self, kerneldoc, docs_dir, src_dir, output_dir):
+ """
+ Create man pages from kernel-doc output
+ """
+
+ re_kernel_doc = re.compile(r"^\.\.\s+kernel-doc::\s*(\S+)")
+ re_man = re.compile(r'^\.TH "[^"]*" (\d+) "([^"]*)"')
+
+ if docs_dir == src_dir:
+ #
+ # Pick the entire set of kernel-doc markups from the entire tree
+ #
+ kdoc_files = set([self.srctree])
+ else:
+ kdoc_files = set()
+
+ for fname in glob(os.path.join(src_dir, "**"), recursive=True):
+ if os.path.isfile(fname) and fname.endswith(".rst"):
+ with open(fname, "r", encoding="utf-8") as in_fp:
+ data = in_fp.read()
+
+ for line in data.split("\n"):
+ match = re_kernel_doc.match(line)
+ if match:
+ if os.path.isfile(match.group(1)):
+ kdoc_files.add(match.group(1))
+
+ if not kdoc_files:
+ sys.exit(f"Directory {src_dir} doesn't contain kernel-doc tags")
+
+ cmd = [ kerneldoc, "-m" ] + sorted(kdoc_files)
+ try:
+ if self.verbose:
+ print(" ".join(cmd))
+
+ result = subprocess.run(cmd, stdout=subprocess.PIPE, text= True)
+
+ if result.returncode:
+ print(f"Warning: kernel-doc returned {result.returncode} warnings")
+
+ except (OSError, ValueError, subprocess.SubprocessError) as e:
+ sys.exit(f"Failed to create man pages for {src_dir}: {repr(e)}")
+
+ fp = None
+ try:
+ for line in result.stdout.split("\n"):
+ match = re_man.match(line)
+ if not match:
+ if fp:
+ fp.write(line + '\n')
+ continue
+
+ if fp:
+ fp.close()
+
+ fname = f"{output_dir}/{match.group(2)}.{match.group(1)}"
+
+ if self.verbose:
+ print(f"Creating {fname}")
+ fp = open(fname, "w", encoding="utf-8")
+ fp.write(line + '\n')
+ finally:
+ if fp:
+ fp.close()
+
+ def cleandocs(self, builder): # pylint: disable=W0613
+ """Remove documentation output directory"""
+ shutil.rmtree(self.builddir, ignore_errors=True)
+
+ def build(self, target, sphinxdirs=None,
+ theme=None, css=None, paper=None, deny_vf=None,
+ skip_sphinx=False):
+ """
+ Build documentation using Sphinx. This is the core function of this
+ module. It prepares all arguments required by sphinx-build.
+ """
+
+ builder = TARGETS[target]["builder"]
+ out_dir = TARGETS[target].get("out_dir", "")
+
+ #
+ # Cleandocs doesn't require sphinx-build
+ #
+ if target == "cleandocs":
+ self.cleandocs(builder)
+ return
+
+ if theme:
+ os.environ["DOCS_THEME"] = theme
+
+ #
+ # Other targets require sphinx-build, so check if it exists
+ #
+ if not skip_sphinx:
+ sphinxbuild = shutil.which(self.sphinxbuild, path=self.env["PATH"])
+ if not sphinxbuild and target != "mandocs":
+ sys.exit(f"Error: {self.sphinxbuild} not found in PATH.\n")
+
+ if target == "pdfdocs":
+ if not self.pdflatex_cmd and not self.latexmk_cmd:
+ sys.exit("Error: pdflatex or latexmk required for PDF generation")
+
+ docs_dir = os.path.abspath(os.path.join(self.srctree, "Documentation"))
+
+ #
+ # Fill in base arguments for Sphinx build
+ #
+ kerneldoc = self.kerneldoc
+ if kerneldoc.startswith(self.srctree):
+ kerneldoc = os.path.relpath(kerneldoc, self.srctree)
+
+ args = [ "-b", builder, "-c", docs_dir ]
+
+ if builder == "latex":
+ if not paper:
+ paper = PAPER[1]
+
+ args.extend(["-D", f"latex_elements.papersize={paper}paper"])
+
+ if self.rustdoc:
+ args.extend(["-t", "rustdoc"])
+
+ if not sphinxdirs:
+ sphinxdirs = os.environ.get("SPHINXDIRS", ".")
+
+ #
+ # The sphinx-build tool has a bug: internally, it tries to set
+ # locale with locale.setlocale(locale.LC_ALL, ''). This causes a
+ # crash if language is not set. Detect and fix it.
+ #
+ try:
+ locale.setlocale(locale.LC_ALL, '')
+ except locale.Error:
+ self.env["LC_ALL"] = "C"
+
+ #
+ # sphinxdirs can be a list or a whitespace-separated string
+ #
+ sphinxdirs_list = []
+ for sphinxdir in sphinxdirs:
+ if isinstance(sphinxdir, list):
+ sphinxdirs_list += sphinxdir
+ else:
+ sphinxdirs_list += sphinxdir.split()
+
+ #
+ # Step 1: Build each directory in separate.
+ #
+ # This is not the best way of handling it, as cross-references between
+ # them will be broken, but this is what we've been doing since
+ # the beginning.
+ #
+ output_dirs = []
+ for sphinxdir in sphinxdirs_list:
+ src_dir = os.path.join(docs_dir, sphinxdir)
+ doctree_dir = os.path.join(self.builddir, ".doctrees")
+ output_dir = os.path.join(self.builddir, sphinxdir, out_dir)
+
+ #
+ # Make directory names canonical
+ #
+ src_dir = os.path.normpath(src_dir)
+ doctree_dir = os.path.normpath(doctree_dir)
+ output_dir = os.path.normpath(output_dir)
+
+ os.makedirs(doctree_dir, exist_ok=True)
+ os.makedirs(output_dir, exist_ok=True)
+
+ output_dirs.append(output_dir)
+
+ build_args = args + [
+ "-d", doctree_dir,
+ "-D", f"kerneldoc_bin={kerneldoc}",
+ "-D", f"version={self.kernelversion}",
+ "-D", f"release={self.kernelrelease}",
+ "-D", f"kerneldoc_srctree={self.srctree}",
+ src_dir,
+ output_dir,
+ ]
+
+ if target == "mandocs":
+ self.handle_man(kerneldoc, docs_dir, src_dir, output_dir)
+ elif not skip_sphinx:
+ try:
+ result = self.run_sphinx(sphinxbuild, build_args,
+ env=self.env)
+
+ if result:
+ sys.exit(f"Build failed: return code: {result}")
+
+ except (OSError, ValueError, subprocess.SubprocessError) as e:
+ sys.exit(f"Build failed: {repr(e)}")
+
+ #
+ # Ensure that each html/epub output will have needed static files
+ #
+ if target in ["htmldocs", "epubdocs"]:
+ self.handle_html(css, output_dir)
+
+ #
+ # Step 2: Some targets (PDF and info) require an extra step once
+ # sphinx-build finishes
+ #
+ if target == "pdfdocs":
+ self.handle_pdf(output_dirs, deny_vf)
+ elif target == "infodocs":
+ self.handle_info(output_dirs)
+
+def jobs_type(value):
+ """
+ Handle valid values for -j. Accepts Sphinx "-jauto", plus a number
+ equal or bigger than one.
+ """
+ if value is None:
+ return None
+
+ if value.lower() == 'auto':
+ return value.lower()
+
+ try:
+ if int(value) >= 1:
+ return value
+
+ raise argparse.ArgumentTypeError(f"Minimum jobs is 1, got {value}")
+ except ValueError:
+ raise argparse.ArgumentTypeError(f"Must be 'auto' or positive integer, got {value}") # pylint: disable=W0707
+
+def main():
+ """
+ Main function. The only mandatory argument is the target. If not
+ specified, the other arguments will use default values if not
+ specified at os.environ.
+ """
+ parser = argparse.ArgumentParser(description="Kernel documentation builder")
+
+ parser.add_argument("target", choices=list(TARGETS.keys()),
+ help="Documentation target to build")
+ parser.add_argument("--sphinxdirs", nargs="+",
+ help="Specific directories to build")
+ parser.add_argument("--builddir", default="output",
+ help="Sphinx configuration file")
+
+ parser.add_argument("--theme", help="Sphinx theme to use")
+
+ parser.add_argument("--css", help="Custom CSS file for HTML/EPUB")
+
+ parser.add_argument("--paper", choices=PAPER, default=PAPER[0],
+ help="Paper size for LaTeX/PDF output")
+
+ parser.add_argument('--deny-vf',
+ help="Configuration to deny variable fonts on pdf builds")
+
+ parser.add_argument("-v", "--verbose", action='store_true',
+ help="place build in verbose mode")
+
+ parser.add_argument('-j', '--jobs', type=jobs_type,
+ help="Sets number of jobs to use with sphinx-build")
+
+ parser.add_argument('-i', '--interactive', action='store_true',
+ help="Change latex default to run in interactive mode")
+
+ parser.add_argument('-s', '--skip-sphinx-build', action='store_true',
+ help="Skip sphinx-build step")
+
+ parser.add_argument("-V", "--venv", nargs='?', const=f'{VENV_DEFAULT}',
+ default=None,
+ help=f'If used, run Sphinx from a venv dir (default dir: {VENV_DEFAULT})')
+
+ args = parser.parse_args()
+
+ PythonVersion.check_python(MIN_PYTHON_VERSION, show_alternatives=True,
+ bail_out=True)
+
+ builder = SphinxBuilder(builddir=args.builddir, venv=args.venv,
+ verbose=args.verbose, n_jobs=args.jobs,
+ interactive=args.interactive)
+
+ builder.build(args.target, sphinxdirs=args.sphinxdirs,
+ theme=args.theme, css=args.css, paper=args.paper,
+ deny_vf=args.deny_vf,
+ skip_sphinx=args.skip_sphinx_build)
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/docs/sphinx-pre-install b/tools/docs/sphinx-pre-install
new file mode 100755
index 000000000000..965c9b093a41
--- /dev/null
+++ b/tools/docs/sphinx-pre-install
@@ -0,0 +1,1543 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2017-2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+#
+# pylint: disable=C0103,C0114,C0115,C0116,C0301,C0302
+# pylint: disable=R0902,R0904,R0911,R0912,R0914,R0915,R1705,R1710,E1121
+
+# Note: this script requires at least Python 3.6 to run.
+# Don't add changes not compatible with it, it is meant to report
+# incompatible python versions.
+
+"""
+Dependency checker for Sphinx documentation Kernel build.
+
+This module provides tools to check for all required dependencies needed to
+build documentation using Sphinx, including system packages, Python modules
+and LaTeX packages for PDF generation.
+
+It detect packages for a subset of Linux distributions used by Kernel
+maintainers, showing hints and missing dependencies.
+
+The main class SphinxDependencyChecker handles the dependency checking logic
+and provides recommendations for installing missing packages. It supports both
+system package installations and Python virtual environments. By default,
+system pacage install is recommended.
+"""
+
+import argparse
+import locale
+import os
+import re
+import subprocess
+import sys
+from glob import glob
+import os.path
+
+src_dir = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(src_dir, '../lib/python'))
+from kdoc.python_version import PythonVersion
+
+RECOMMENDED_VERSION = PythonVersion("3.4.3").version
+MIN_PYTHON_VERSION = PythonVersion("3.7").version
+
+
+class DepManager:
+ """
+ Manage package dependencies. There are three types of dependencies:
+
+ - System: dependencies required for docs build;
+ - Python: python dependencies for a native distro Sphinx install;
+ - PDF: dependencies needed by PDF builds.
+
+ Each dependency can be mandatory or optional. Not installing an optional
+ dependency won't break the build, but will cause degradation at the
+ docs output.
+ """
+
+ # Internal types of dependencies. Don't use them outside DepManager class.
+ _SYS_TYPE = 0
+ _PHY_TYPE = 1
+ _PDF_TYPE = 2
+
+ # Dependencies visible outside the class.
+ # The keys are tuple with: (type, is_mandatory flag).
+ #
+ # Currently we're not using all optional dep types. Yet, we'll keep all
+ # possible combinations here. They're not many, and that makes easier
+ # if later needed and for the name() method below
+
+ SYSTEM_MANDATORY = (_SYS_TYPE, True)
+ PYTHON_MANDATORY = (_PHY_TYPE, True)
+ PDF_MANDATORY = (_PDF_TYPE, True)
+
+ SYSTEM_OPTIONAL = (_SYS_TYPE, False)
+ PYTHON_OPTIONAL = (_PHY_TYPE, False)
+ PDF_OPTIONAL = (_PDF_TYPE, True)
+
+ def __init__(self, pdf):
+ """
+ Initialize internal vars:
+
+ - missing: missing dependencies list, containing a distro-independent
+ name for a missing dependency and its type.
+ - missing_pkg: ancillary dict containing missing dependencies in
+ distro namespace, organized by type.
+ - need: total number of needed dependencies. Never cleaned.
+ - optional: total number of optional dependencies. Never cleaned.
+ - pdf: Is PDF support enabled?
+ """
+ self.missing = {}
+ self.missing_pkg = {}
+ self.need = 0
+ self.optional = 0
+ self.pdf = pdf
+
+ @staticmethod
+ def name(dtype):
+ """
+ Ancillary routine to output a warn/error message reporting
+ missing dependencies.
+ """
+ if dtype[0] == DepManager._SYS_TYPE:
+ msg = "build"
+ elif dtype[0] == DepManager._PHY_TYPE:
+ msg = "Python"
+ else:
+ msg = "PDF"
+
+ if dtype[1]:
+ return f"ERROR: {msg} mandatory deps missing"
+ else:
+ return f"Warning: {msg} optional deps missing"
+
+ @staticmethod
+ def is_optional(dtype):
+ """Ancillary routine to report if a dependency is optional"""
+ return not dtype[1]
+
+ @staticmethod
+ def is_pdf(dtype):
+ """Ancillary routine to report if a dependency is for PDF generation"""
+ if dtype[0] == DepManager._PDF_TYPE:
+ return True
+
+ return False
+
+ def add_package(self, package, dtype):
+ """
+ Add a package at the self.missing() dictionary.
+ Doesn't update missing_pkg.
+ """
+ is_optional = DepManager.is_optional(dtype)
+ self.missing[package] = dtype
+ if is_optional:
+ self.optional += 1
+ else:
+ self.need += 1
+
+ def del_package(self, package):
+ """
+ Remove a package at the self.missing() dictionary.
+ Doesn't update missing_pkg.
+ """
+ if package in self.missing:
+ del self.missing[package]
+
+ def clear_deps(self):
+ """
+ Clear dependencies without changing needed/optional.
+
+ This is an ackward way to have a separate section to recommend
+ a package after system main dependencies.
+
+ TODO: rework the logic to prevent needing it.
+ """
+
+ self.missing = {}
+ self.missing_pkg = {}
+
+ def check_missing(self, progs):
+ """
+ Update self.missing_pkg, using progs dict to convert from the
+ agnostic package name to distro-specific one.
+
+ Returns an string with the packages to be installed, sorted and
+ with eventual duplicates removed.
+ """
+
+ self.missing_pkg = {}
+
+ for prog, dtype in sorted(self.missing.items()):
+ # At least on some LTS distros like CentOS 7, texlive doesn't
+ # provide all packages we need. When such distros are
+ # detected, we have to disable PDF output.
+ #
+ # So, we need to ignore the packages that distros would
+ # need for LaTeX to work
+ if DepManager.is_pdf(dtype) and not self.pdf:
+ self.optional -= 1
+ continue
+
+ if not dtype in self.missing_pkg:
+ self.missing_pkg[dtype] = []
+
+ self.missing_pkg[dtype].append(progs.get(prog, prog))
+
+ install = []
+ for dtype, pkgs in self.missing_pkg.items():
+ install += pkgs
+
+ return " ".join(sorted(set(install)))
+
+ def warn_install(self):
+ """
+ Emit warnings/errors related to missing packages.
+ """
+
+ output_msg = ""
+
+ for dtype in sorted(self.missing_pkg.keys()):
+ progs = " ".join(sorted(set(self.missing_pkg[dtype])))
+
+ try:
+ name = DepManager.name(dtype)
+ output_msg += f'{name}:\t{progs}\n'
+ except KeyError:
+ raise KeyError(f"ERROR!!!: invalid dtype for {progs}: {dtype}")
+
+ if output_msg:
+ print(f"\n{output_msg}")
+
+class AncillaryMethods:
+ """
+ Ancillary methods that checks for missing dependencies for different
+ types of types, like binaries, python modules, rpm deps, etc.
+ """
+
+ @staticmethod
+ def which(prog):
+ """
+ Our own implementation of which(). We could instead use
+ shutil.which(), but this function is simple enough.
+ Probably faster to use this implementation than to import shutil.
+ """
+ for path in os.environ.get("PATH", "").split(":"):
+ full_path = os.path.join(path, prog)
+ if os.access(full_path, os.X_OK):
+ return full_path
+
+ return None
+
+ @staticmethod
+ def run(*args, **kwargs):
+ """
+ Excecute a command, hiding its output by default.
+ Preserve compatibility with older Python versions.
+ """
+
+ capture_output = kwargs.pop('capture_output', False)
+
+ if capture_output:
+ if 'stdout' not in kwargs:
+ kwargs['stdout'] = subprocess.PIPE
+ if 'stderr' not in kwargs:
+ kwargs['stderr'] = subprocess.PIPE
+ else:
+ if 'stdout' not in kwargs:
+ kwargs['stdout'] = subprocess.DEVNULL
+ if 'stderr' not in kwargs:
+ kwargs['stderr'] = subprocess.DEVNULL
+
+ # Don't break with older Python versions
+ if 'text' in kwargs and sys.version_info < (3, 7):
+ kwargs['universal_newlines'] = kwargs.pop('text')
+
+ return subprocess.run(*args, **kwargs)
+
+class MissingCheckers(AncillaryMethods):
+ """
+ Contains some ancillary checkers for different types of binaries and
+ package managers.
+ """
+
+ def __init__(self, args, texlive):
+ """
+ Initialize its internal variables
+ """
+ self.pdf = args.pdf
+ self.virtualenv = args.virtualenv
+ self.version_check = args.version_check
+ self.texlive = texlive
+
+ self.min_version = (0, 0, 0)
+ self.cur_version = (0, 0, 0)
+
+ self.deps = DepManager(self.pdf)
+
+ self.need_symlink = 0
+ self.need_sphinx = 0
+
+ self.verbose_warn_install = 1
+
+ self.virtenv_dir = ""
+ self.install = ""
+ self.python_cmd = ""
+
+ self.virtenv_prefix = ["sphinx_", "Sphinx_" ]
+
+ def check_missing_file(self, files, package, dtype):
+ """
+ Does the file exists? If not, add it to missing dependencies.
+ """
+ for f in files:
+ if os.path.exists(f):
+ return
+ self.deps.add_package(package, dtype)
+
+ def check_program(self, prog, dtype):
+ """
+ Does the program exists and it is at the PATH?
+ If not, add it to missing dependencies.
+ """
+ found = self.which(prog)
+ if found:
+ return found
+
+ self.deps.add_package(prog, dtype)
+
+ return None
+
+ def check_perl_module(self, prog, dtype):
+ """
+ Does perl have a dependency? Is it available?
+ If not, add it to missing dependencies.
+
+ Right now, we still need Perl for doc build, as it is required
+ by some tools called at docs or kernel build time, like:
+
+ tools/docs/documentation-file-ref-check
+
+ Also, checkpatch is on Perl.
+ """
+
+ # While testing with lxc download template, one of the
+ # distros (Oracle) didn't have perl - nor even an option to install
+ # before installing oraclelinux-release-el9 package.
+ #
+ # Check it before running an error. If perl is not there,
+ # add it as a mandatory package, as some parts of the doc builder
+ # needs it.
+ if not self.which("perl"):
+ self.deps.add_package("perl", DepManager.SYSTEM_MANDATORY)
+ self.deps.add_package(prog, dtype)
+ return
+
+ try:
+ self.run(["perl", f"-M{prog}", "-e", "1"], check=True)
+ except subprocess.CalledProcessError:
+ self.deps.add_package(prog, dtype)
+
+ def check_python_module(self, module, is_optional=False):
+ """
+ Does a python module exists outside venv? If not, add it to missing
+ dependencies.
+ """
+ if is_optional:
+ dtype = DepManager.PYTHON_OPTIONAL
+ else:
+ dtype = DepManager.PYTHON_MANDATORY
+
+ try:
+ self.run([self.python_cmd, "-c", f"import {module}"], check=True)
+ except subprocess.CalledProcessError:
+ self.deps.add_package(module, dtype)
+
+ def check_rpm_missing(self, pkgs, dtype):
+ """
+ Does a rpm package exists? If not, add it to missing dependencies.
+ """
+ for prog in pkgs:
+ try:
+ self.run(["rpm", "-q", prog], check=True)
+ except subprocess.CalledProcessError:
+ self.deps.add_package(prog, dtype)
+
+ def check_pacman_missing(self, pkgs, dtype):
+ """
+ Does a pacman package exists? If not, add it to missing dependencies.
+ """
+ for prog in pkgs:
+ try:
+ self.run(["pacman", "-Q", prog], check=True)
+ except subprocess.CalledProcessError:
+ self.deps.add_package(prog, dtype)
+
+ def check_missing_tex(self, is_optional=False):
+ """
+ Does a LaTeX package exists? If not, add it to missing dependencies.
+ """
+ if is_optional:
+ dtype = DepManager.PDF_OPTIONAL
+ else:
+ dtype = DepManager.PDF_MANDATORY
+
+ kpsewhich = self.which("kpsewhich")
+ for prog, package in self.texlive.items():
+
+ # If kpsewhich is not there, just add it to deps
+ if not kpsewhich:
+ self.deps.add_package(package, dtype)
+ continue
+
+ # Check if the package is needed
+ try:
+ result = self.run(
+ [kpsewhich, prog], stdout=subprocess.PIPE, text=True, check=True
+ )
+
+ # Didn't find. Add it
+ if not result.stdout.strip():
+ self.deps.add_package(package, dtype)
+
+ except subprocess.CalledProcessError:
+ # kpsewhich returned an error. Add it, just in case
+ self.deps.add_package(package, dtype)
+
+ def get_sphinx_fname(self):
+ """
+ Gets the binary filename for sphinx-build.
+ """
+ if "SPHINXBUILD" in os.environ:
+ return os.environ["SPHINXBUILD"]
+
+ fname = "sphinx-build"
+ if self.which(fname):
+ return fname
+
+ fname = "sphinx-build-3"
+ if self.which(fname):
+ self.need_symlink = 1
+ return fname
+
+ return ""
+
+ def get_sphinx_version(self, cmd):
+ """
+ Gets sphinx-build version.
+ """
+ env = os.environ.copy()
+
+ # The sphinx-build tool has a bug: internally, it tries to set
+ # locale with locale.setlocale(locale.LC_ALL, ''). This causes a
+ # crash if language is not set. Detect and fix it.
+ try:
+ locale.setlocale(locale.LC_ALL, '')
+ except Exception:
+ env["LC_ALL"] = "C"
+ env["LANG"] = "C"
+
+ try:
+ result = self.run([cmd, "--version"], env=env,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True, check=True)
+ except (subprocess.CalledProcessError, FileNotFoundError):
+ return None
+
+ for line in result.stdout.split("\n"):
+ match = re.match(r"^sphinx-build\s+([\d\.]+)(?:\+(?:/[\da-f]+)|b\d+)?\s*$", line)
+ if match:
+ return PythonVersion.parse_version(match.group(1))
+
+ match = re.match(r"^Sphinx.*\s+([\d\.]+)\s*$", line)
+ if match:
+ return PythonVersion.parse_version(match.group(1))
+
+ def check_sphinx(self, conf):
+ """
+ Checks Sphinx minimal requirements
+ """
+ try:
+ with open(conf, "r", encoding="utf-8") as f:
+ for line in f:
+ match = re.match(r"^\s*needs_sphinx\s*=\s*[\'\"]([\d\.]+)[\'\"]", line)
+ if match:
+ self.min_version = PythonVersion.parse_version(match.group(1))
+ break
+ except IOError:
+ sys.exit(f"Can't open {conf}")
+
+ if not self.min_version:
+ sys.exit(f"Can't get needs_sphinx version from {conf}")
+
+ self.virtenv_dir = self.virtenv_prefix[0] + "latest"
+
+ sphinx = self.get_sphinx_fname()
+ if not sphinx:
+ self.need_sphinx = 1
+ return
+
+ self.cur_version = self.get_sphinx_version(sphinx)
+ if not self.cur_version:
+ sys.exit(f"{sphinx} didn't return its version")
+
+ if self.cur_version < self.min_version:
+ curver = PythonVersion.ver_str(self.cur_version)
+ minver = PythonVersion.ver_str(self.min_version)
+
+ print(f"ERROR: Sphinx version is {curver}. It should be >= {minver}")
+ self.need_sphinx = 1
+ return
+
+ # On version check mode, just assume Sphinx has all mandatory deps
+ if self.version_check and self.cur_version >= RECOMMENDED_VERSION:
+ sys.exit(0)
+
+ def catcheck(self, filename):
+ """
+ Reads a file if it exists, returning as string.
+ If not found, returns an empty string.
+ """
+ if os.path.exists(filename):
+ with open(filename, "r", encoding="utf-8") as f:
+ return f.read().strip()
+ return ""
+
+ def get_system_release(self):
+ """
+ Determine the system type. There's no unique way that would work
+ with all distros with a minimal package install. So, several
+ methods are used here.
+
+ By default, it will use lsb_release function. If not available, it will
+ fail back to reading the known different places where the distro name
+ is stored.
+
+ Several modern distros now have /etc/os-release, which usually have
+ a decent coverage.
+ """
+
+ system_release = ""
+
+ if self.which("lsb_release"):
+ result = self.run(["lsb_release", "-d"], capture_output=True, text=True)
+ system_release = result.stdout.replace("Description:", "").strip()
+
+ release_files = [
+ "/etc/system-release",
+ "/etc/redhat-release",
+ "/etc/lsb-release",
+ "/etc/gentoo-release",
+ ]
+
+ if not system_release:
+ for f in release_files:
+ system_release = self.catcheck(f)
+ if system_release:
+ break
+
+ # This seems more common than LSB these days
+ if not system_release:
+ os_var = {}
+ try:
+ with open("/etc/os-release", "r", encoding="utf-8") as f:
+ for line in f:
+ match = re.match(r"^([\w\d\_]+)=\"?([^\"]*)\"?\n", line)
+ if match:
+ os_var[match.group(1)] = match.group(2)
+
+ system_release = os_var.get("NAME", "")
+ if "VERSION_ID" in os_var:
+ system_release += " " + os_var["VERSION_ID"]
+ elif "VERSION" in os_var:
+ system_release += " " + os_var["VERSION"]
+ except IOError:
+ pass
+
+ if not system_release:
+ system_release = self.catcheck("/etc/issue")
+
+ system_release = system_release.strip()
+
+ return system_release
+
+class SphinxDependencyChecker(MissingCheckers):
+ """
+ Main class for checking Sphinx documentation build dependencies.
+
+ - Check for missing system packages;
+ - Check for missing Python modules;
+ - Check for missing LaTeX packages needed by PDF generation;
+ - Propose Sphinx install via Python Virtual environment;
+ - Propose Sphinx install via distro-specific package install.
+ """
+ def __init__(self, args):
+ """Initialize checker variables"""
+
+ # List of required texlive packages on Fedora and OpenSuse
+ texlive = {
+ "amsfonts.sty": "texlive-amsfonts",
+ "amsmath.sty": "texlive-amsmath",
+ "amssymb.sty": "texlive-amsfonts",
+ "amsthm.sty": "texlive-amscls",
+ "anyfontsize.sty": "texlive-anyfontsize",
+ "atbegshi.sty": "texlive-oberdiek",
+ "bm.sty": "texlive-tools",
+ "capt-of.sty": "texlive-capt-of",
+ "cmap.sty": "texlive-cmap",
+ "ctexhook.sty": "texlive-ctex",
+ "ecrm1000.tfm": "texlive-ec",
+ "eqparbox.sty": "texlive-eqparbox",
+ "eu1enc.def": "texlive-euenc",
+ "fancybox.sty": "texlive-fancybox",
+ "fancyvrb.sty": "texlive-fancyvrb",
+ "float.sty": "texlive-float",
+ "fncychap.sty": "texlive-fncychap",
+ "footnote.sty": "texlive-mdwtools",
+ "framed.sty": "texlive-framed",
+ "luatex85.sty": "texlive-luatex85",
+ "multirow.sty": "texlive-multirow",
+ "needspace.sty": "texlive-needspace",
+ "palatino.sty": "texlive-psnfss",
+ "parskip.sty": "texlive-parskip",
+ "polyglossia.sty": "texlive-polyglossia",
+ "tabulary.sty": "texlive-tabulary",
+ "threeparttable.sty": "texlive-threeparttable",
+ "titlesec.sty": "texlive-titlesec",
+ "ucs.sty": "texlive-ucs",
+ "upquote.sty": "texlive-upquote",
+ "wrapfig.sty": "texlive-wrapfig",
+ }
+
+ super().__init__(args, texlive)
+
+ self.need_pip = False
+ self.rec_sphinx_upgrade = 0
+
+ self.system_release = self.get_system_release()
+ self.activate_cmd = ""
+
+ # Some distros may not have a Sphinx shipped package compatible with
+ # our minimal requirements
+ self.package_supported = True
+
+ # Recommend a new python version
+ self.recommend_python = None
+
+ # Certain hints are meant to be shown only once
+ self.distro_msg = None
+
+ self.latest_avail_ver = (0, 0, 0)
+ self.venv_ver = (0, 0, 0)
+
+ prefix = os.environ.get("srctree", ".") + "/"
+
+ self.conf = prefix + "Documentation/conf.py"
+ self.requirement_file = prefix + "Documentation/sphinx/requirements.txt"
+
+ def get_install_progs(self, progs, cmd, extra=None):
+ """
+ Check for missing dependencies using the provided program mapping.
+
+ The actual distro-specific programs are mapped via progs argument.
+ """
+ install = self.deps.check_missing(progs)
+
+ if self.verbose_warn_install:
+ self.deps.warn_install()
+
+ if not install:
+ return
+
+ if cmd:
+ if self.verbose_warn_install:
+ msg = "You should run:"
+ else:
+ msg = ""
+
+ if extra:
+ msg += "\n\t" + extra.replace("\n", "\n\t")
+
+ return(msg + "\n\tsudo " + cmd + " " + install)
+
+ return None
+
+ #
+ # Distro-specific hints methods
+ #
+
+ def give_debian_hints(self):
+ """
+ Provide package installation hints for Debian-based distros.
+ """
+ progs = {
+ "Pod::Usage": "perl-modules",
+ "convert": "imagemagick",
+ "dot": "graphviz",
+ "ensurepip": "python3-venv",
+ "python-sphinx": "python3-sphinx",
+ "rsvg-convert": "librsvg2-bin",
+ "virtualenv": "virtualenv",
+ "xelatex": "texlive-xetex",
+ "yaml": "python3-yaml",
+ }
+
+ if self.pdf:
+ pdf_pkgs = {
+ "fonts-dejavu": [
+ "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+ ],
+ "fonts-noto-cjk": [
+ "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc",
+ "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
+ "/usr/share/fonts/opentype/noto/NotoSerifCJK-Regular.ttc",
+ ],
+ "tex-gyre": [
+ "/usr/share/texmf/tex/latex/tex-gyre/tgtermes.sty"
+ ],
+ "texlive-fonts-recommended": [
+ "/usr/share/texlive/texmf-dist/fonts/tfm/adobe/zapfding/pzdr.tfm",
+ ],
+ "texlive-lang-chinese": [
+ "/usr/share/texlive/texmf-dist/tex/latex/ctex/ctexhook.sty",
+ ],
+ }
+
+ for package, files in pdf_pkgs.items():
+ self.check_missing_file(files, package, DepManager.PDF_MANDATORY)
+
+ self.check_program("dvipng", DepManager.PDF_MANDATORY)
+
+ if not self.distro_msg:
+ self.distro_msg = \
+ "Note: ImageMagick is broken on some distros, affecting PDF output. For more details:\n" \
+ "\thttps://askubuntu.com/questions/1158894/imagemagick-still-broken-using-with-usr-bin-convert"
+
+ return self.get_install_progs(progs, "apt-get install")
+
+ def give_redhat_hints(self):
+ """
+ Provide package installation hints for RedHat-based distros
+ (Fedora, RHEL and RHEL-based variants).
+ """
+ progs = {
+ "Pod::Usage": "perl-Pod-Usage",
+ "convert": "ImageMagick",
+ "dot": "graphviz",
+ "python-sphinx": "python3-sphinx",
+ "rsvg-convert": "librsvg2-tools",
+ "virtualenv": "python3-virtualenv",
+ "xelatex": "texlive-xetex-bin",
+ "yaml": "python3-pyyaml",
+ }
+
+ fedora_tex_pkgs = [
+ "dejavu-sans-fonts",
+ "dejavu-sans-mono-fonts",
+ "dejavu-serif-fonts",
+ "texlive-collection-fontsrecommended",
+ "texlive-collection-latex",
+ "texlive-xecjk",
+ ]
+
+ fedora = False
+ rel = None
+
+ match = re.search(r"(release|Linux)\s+(\d+)", self.system_release)
+ if match:
+ rel = int(match.group(2))
+
+ if not rel:
+ print("Couldn't identify release number")
+ noto_sans_redhat = None
+ self.pdf = False
+ elif re.search("Fedora", self.system_release):
+ # Fedora 38 and upper use this CJK font
+
+ noto_sans_redhat = "google-noto-sans-cjk-fonts"
+ fedora = True
+ else:
+ # Almalinux, CentOS, RHEL, ...
+
+ # at least up to version 9 (and Fedora < 38), that's the CJK font
+ noto_sans_redhat = "google-noto-sans-cjk-ttc-fonts"
+
+ progs["virtualenv"] = "python-virtualenv"
+
+ if not rel or rel < 8:
+ print("ERROR: Distro not supported. Too old?")
+ return
+
+ # RHEL 8 uses Python 3.6, which is not compatible with
+ # the build system anymore. Suggest Python 3.11
+ if rel == 8:
+ self.check_program("python3.9", DepManager.SYSTEM_MANDATORY)
+ progs["python3.9"] = "python39"
+ progs["yaml"] = "python39-pyyaml"
+
+ self.recommend_python = True
+
+ # There's no python39-sphinx package. Only pip is supported
+ self.package_supported = False
+
+ if not self.distro_msg:
+ self.distro_msg = \
+ "Note: RHEL-based distros typically require extra repositories.\n" \
+ "For most, enabling epel and crb are enough:\n" \
+ "\tsudo dnf install -y epel-release\n" \
+ "\tsudo dnf config-manager --set-enabled crb\n" \
+ "Yet, some may have other required repositories. Those commands could be useful:\n" \
+ "\tsudo dnf repolist all\n" \
+ "\tsudo dnf repoquery --available --info <pkgs>\n" \
+ "\tsudo dnf config-manager --set-enabled '*' # enable all - probably not what you want"
+
+ if self.pdf:
+ pdf_pkgs = [
+ "/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc",
+ "/usr/share/fonts/google-noto-sans-cjk-fonts/NotoSansCJK-Regular.ttc",
+ ]
+
+ self.check_missing_file(pdf_pkgs, noto_sans_redhat, DepManager.PDF_MANDATORY)
+
+ self.check_rpm_missing(fedora_tex_pkgs, DepManager.PDF_MANDATORY)
+
+ self.check_missing_tex(DepManager.PDF_MANDATORY)
+
+ # There's no texlive-ctex on RHEL 8 repositories. This will
+ # likely affect CJK pdf build only.
+ if not fedora and rel == 8:
+ self.deps.del_package("texlive-ctex")
+
+ return self.get_install_progs(progs, "dnf install")
+
+ def give_opensuse_hints(self):
+ """
+ Provide package installation hints for openSUSE-based distros
+ (Leap and Tumbleweed).
+ """
+ progs = {
+ "Pod::Usage": "perl-Pod-Usage",
+ "convert": "ImageMagick",
+ "dot": "graphviz",
+ "python-sphinx": "python3-sphinx",
+ "virtualenv": "python3-virtualenv",
+ "xelatex": "texlive-xetex-bin texlive-dejavu",
+ "yaml": "python3-pyyaml",
+ }
+
+ suse_tex_pkgs = [
+ "texlive-babel-english",
+ "texlive-caption",
+ "texlive-colortbl",
+ "texlive-courier",
+ "texlive-dvips",
+ "texlive-helvetic",
+ "texlive-makeindex",
+ "texlive-metafont",
+ "texlive-metapost",
+ "texlive-palatino",
+ "texlive-preview",
+ "texlive-times",
+ "texlive-zapfchan",
+ "texlive-zapfding",
+ ]
+
+ progs["latexmk"] = "texlive-latexmk-bin"
+
+ match = re.search(r"(Leap)\s+(\d+).(\d)", self.system_release)
+ if match:
+ rel = int(match.group(2))
+
+ # Leap 15.x uses Python 3.6, which is not compatible with
+ # the build system anymore. Suggest Python 3.11
+ if rel == 15:
+ if not self.which(self.python_cmd):
+ self.check_program("python3.11", DepManager.SYSTEM_MANDATORY)
+ progs["python3.11"] = "python311"
+ self.recommend_python = True
+
+ progs.update({
+ "python-sphinx": "python311-Sphinx python311-Sphinx-latex",
+ "virtualenv": "python311-virtualenv",
+ "yaml": "python311-PyYAML",
+ })
+ else:
+ # Tumbleweed defaults to Python 3.11
+
+ progs.update({
+ "python-sphinx": "python313-Sphinx python313-Sphinx-latex",
+ "virtualenv": "python313-virtualenv",
+ "yaml": "python313-PyYAML",
+ })
+
+ # FIXME: add support for installing CJK fonts
+ #
+ # I tried hard, but was unable to find a way to install
+ # "Noto Sans CJK SC" on openSUSE
+
+ if self.pdf:
+ self.check_rpm_missing(suse_tex_pkgs, DepManager.PDF_MANDATORY)
+ if self.pdf:
+ self.check_missing_tex()
+
+ return self.get_install_progs(progs, "zypper install --no-recommends")
+
+ def give_mageia_hints(self):
+ """
+ Provide package installation hints for Mageia and OpenMandriva.
+ """
+ progs = {
+ "Pod::Usage": "perl-Pod-Usage",
+ "convert": "ImageMagick",
+ "dot": "graphviz",
+ "python-sphinx": "python3-sphinx",
+ "rsvg-convert": "librsvg2",
+ "virtualenv": "python3-virtualenv",
+ "xelatex": "texlive",
+ "yaml": "python3-yaml",
+ }
+
+ tex_pkgs = [
+ "texlive-fontsextra",
+ "texlive-fonts-asian",
+ "fonts-ttf-dejavu",
+ ]
+
+ if re.search(r"OpenMandriva", self.system_release):
+ packager_cmd = "dnf install"
+ noto_sans = "noto-sans-cjk-fonts"
+ tex_pkgs = [
+ "texlive-collection-basic",
+ "texlive-collection-langcjk",
+ "texlive-collection-fontsextra",
+ "texlive-collection-fontsrecommended"
+ ]
+
+ # Tested on OpenMandriva Lx 4.3
+ progs["convert"] = "imagemagick"
+ progs["yaml"] = "python-pyyaml"
+ progs["python-virtualenv"] = "python-virtualenv"
+ progs["python-sphinx"] = "python-sphinx"
+ progs["xelatex"] = "texlive"
+
+ self.check_program("python-virtualenv", DepManager.PYTHON_MANDATORY)
+
+ # On my tests with openMandriva LX 4.0 docker image, upgraded
+ # to 4.3, python-virtualenv package is broken: it is missing
+ # ensurepip. Without it, the alternative would be to run:
+ # python3 -m venv --without-pip ~/sphinx_latest, but running
+ # pip there won't install sphinx at venv.
+ #
+ # Add a note about that.
+
+ if not self.distro_msg:
+ self.distro_msg = \
+ "Notes:\n"\
+ "1. for venv, ensurepip could be broken, preventing its install method.\n" \
+ "2. at least on OpenMandriva LX 4.3, texlive packages seem broken"
+
+ else:
+ packager_cmd = "urpmi"
+ noto_sans = "google-noto-sans-cjk-ttc-fonts"
+
+ progs["latexmk"] = "texlive-collection-basic"
+
+ if self.pdf:
+ pdf_pkgs = [
+ "/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc",
+ "/usr/share/fonts/TTF/NotoSans-Regular.ttf",
+ ]
+
+ self.check_missing_file(pdf_pkgs, noto_sans, DepManager.PDF_MANDATORY)
+ self.check_rpm_missing(tex_pkgs, DepManager.PDF_MANDATORY)
+
+ return self.get_install_progs(progs, packager_cmd)
+
+ def give_arch_linux_hints(self):
+ """
+ Provide package installation hints for ArchLinux.
+ """
+ progs = {
+ "convert": "imagemagick",
+ "dot": "graphviz",
+ "latexmk": "texlive-core",
+ "rsvg-convert": "extra/librsvg",
+ "virtualenv": "python-virtualenv",
+ "xelatex": "texlive-xetex",
+ "yaml": "python-yaml",
+ }
+
+ archlinux_tex_pkgs = [
+ "texlive-basic",
+ "texlive-binextra",
+ "texlive-core",
+ "texlive-fontsrecommended",
+ "texlive-langchinese",
+ "texlive-langcjk",
+ "texlive-latexextra",
+ "ttf-dejavu",
+ ]
+
+ if self.pdf:
+ self.check_pacman_missing(archlinux_tex_pkgs,
+ DepManager.PDF_MANDATORY)
+
+ self.check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"],
+ "noto-fonts-cjk",
+ DepManager.PDF_MANDATORY)
+
+
+ return self.get_install_progs(progs, "pacman -S")
+
+ def give_gentoo_hints(self):
+ """
+ Provide package installation hints for Gentoo.
+ """
+ texlive_deps = [
+ "dev-texlive/texlive-fontsrecommended",
+ "dev-texlive/texlive-latexextra",
+ "dev-texlive/texlive-xetex",
+ "media-fonts/dejavu",
+ ]
+
+ progs = {
+ "convert": "media-gfx/imagemagick",
+ "dot": "media-gfx/graphviz",
+ "rsvg-convert": "gnome-base/librsvg",
+ "virtualenv": "dev-python/virtualenv",
+ "xelatex": " ".join(texlive_deps),
+ "yaml": "dev-python/pyyaml",
+ "python-sphinx": "dev-python/sphinx",
+ }
+
+ if self.pdf:
+ pdf_pkgs = {
+ "media-fonts/dejavu": [
+ "/usr/share/fonts/dejavu/DejaVuSans.ttf",
+ ],
+ "media-fonts/noto-cjk": [
+ "/usr/share/fonts/noto-cjk/NotoSansCJKsc-Regular.otf",
+ "/usr/share/fonts/noto-cjk/NotoSerifCJK-Regular.ttc",
+ ],
+ }
+ for package, files in pdf_pkgs.items():
+ self.check_missing_file(files, package, DepManager.PDF_MANDATORY)
+
+ # Handling dependencies is a nightmare, as Gentoo refuses to emerge
+ # some packages if there's no package.use file describing them.
+ # To make it worse, compilation flags shall also be present there
+ # for some packages. If USE is not perfect, error/warning messages
+ # like those are shown:
+ #
+ # !!! The following binary packages have been ignored due to non matching USE:
+ #
+ # =media-gfx/graphviz-12.2.1-r1 X pdf -python_single_target_python3_13 qt6 svg
+ # =media-gfx/graphviz-12.2.1-r1 X pdf python_single_target_python3_12 -python_single_target_python3_13 qt6 svg
+ # =media-gfx/graphviz-12.2.1-r1 X pdf qt6 svg
+ # =media-gfx/graphviz-12.2.1-r1 X pdf -python_single_target_python3_10 qt6 svg
+ # =media-gfx/graphviz-12.2.1-r1 X pdf -python_single_target_python3_10 python_single_target_python3_12 -python_single_target_python3_13 qt6 svg
+ # =media-fonts/noto-cjk-20190416 X
+ # =app-text/texlive-core-2024-r1 X cjk -xetex
+ # =app-text/texlive-core-2024-r1 X -xetex
+ # =app-text/texlive-core-2024-r1 -xetex
+ # =dev-libs/zziplib-0.13.79-r1 sdl
+ #
+ # And will ignore such packages, installing the remaining ones. That
+ # affects mostly the image extension and PDF generation.
+
+ # Package dependencies and the minimal needed args:
+ portages = {
+ "graphviz": "media-gfx/graphviz",
+ "imagemagick": "media-gfx/imagemagick",
+ "media-libs": "media-libs/harfbuzz icu",
+ "media-fonts": "media-fonts/noto-cjk",
+ "texlive": "app-text/texlive-core xetex",
+ "zziblib": "dev-libs/zziplib sdl",
+ }
+
+ extra_cmds = ""
+ if not self.distro_msg:
+ self.distro_msg = "Note: Gentoo requires package.use to be adjusted before emerging packages"
+
+ use_base = "/etc/portage/package.use"
+ files = glob(f"{use_base}/*")
+
+ for fname, portage in portages.items():
+ install = False
+
+ while install is False:
+ if not files:
+ # No files under package.usage. Install all
+ install = True
+ break
+
+ args = portage.split(" ")
+
+ name = args.pop(0)
+
+ cmd = ["grep", "-l", "-E", rf"^{name}\b" ] + files
+ result = self.run(cmd, stdout=subprocess.PIPE, text=True)
+ if result.returncode or not result.stdout.strip():
+ # File containing portage name not found
+ install = True
+ break
+
+ # Ensure that needed USE flags are present
+ if args:
+ match_fname = result.stdout.strip()
+ with open(match_fname, 'r', encoding='utf8',
+ errors='backslashreplace') as fp:
+ for line in fp:
+ for arg in args:
+ if arg.startswith("-"):
+ continue
+
+ if not re.search(rf"\s*{arg}\b", line):
+ # Needed file argument not found
+ install = True
+ break
+
+ # Everything looks ok, don't install
+ break
+
+ # emit a code to setup missing USE
+ if install:
+ extra_cmds += (f"sudo su -c 'echo \"{portage}\" > {use_base}/{fname}'\n")
+
+ # Now, we can use emerge and let it respect USE
+ return self.get_install_progs(progs,
+ "emerge --ask --changed-use --binpkg-respect-use=y",
+ extra_cmds)
+
+ def get_install(self):
+ """
+ OS-specific hints logic. Seeks for a hinter. If found, use it to
+ provide package-manager specific install commands.
+
+ Otherwise, outputs install instructions for the meta-packages.
+
+ Returns a string with the command to be executed to install the
+ the needed packages, if distro found. Otherwise, return just a
+ list of packages that require installation.
+ """
+ os_hints = {
+ re.compile("Red Hat Enterprise Linux"): self.give_redhat_hints,
+ re.compile("Fedora"): self.give_redhat_hints,
+ re.compile("AlmaLinux"): self.give_redhat_hints,
+ re.compile("Amazon Linux"): self.give_redhat_hints,
+ re.compile("CentOS"): self.give_redhat_hints,
+ re.compile("openEuler"): self.give_redhat_hints,
+ re.compile("Oracle Linux Server"): self.give_redhat_hints,
+ re.compile("Rocky Linux"): self.give_redhat_hints,
+ re.compile("Springdale Open Enterprise"): self.give_redhat_hints,
+
+ re.compile("Ubuntu"): self.give_debian_hints,
+ re.compile("Debian"): self.give_debian_hints,
+ re.compile("Devuan"): self.give_debian_hints,
+ re.compile("Kali"): self.give_debian_hints,
+ re.compile("Mint"): self.give_debian_hints,
+
+ re.compile("openSUSE"): self.give_opensuse_hints,
+
+ re.compile("Mageia"): self.give_mageia_hints,
+ re.compile("OpenMandriva"): self.give_mageia_hints,
+
+ re.compile("Arch Linux"): self.give_arch_linux_hints,
+ re.compile("Gentoo"): self.give_gentoo_hints,
+ }
+
+ # If the OS is detected, use per-OS hint logic
+ for regex, os_hint in os_hints.items():
+ if regex.search(self.system_release):
+ return os_hint()
+
+ #
+ # Fall-back to generic hint code for other distros
+ # That's far from ideal, specially for LaTeX dependencies.
+ #
+ progs = {"sphinx-build": "sphinx"}
+ if self.pdf:
+ self.check_missing_tex()
+
+ self.distro_msg = \
+ f"I don't know distro {self.system_release}.\n" \
+ "So, I can't provide you a hint with the install procedure.\n" \
+ "There are likely missing dependencies."
+
+ return self.get_install_progs(progs, None)
+
+ #
+ # Common dependencies
+ #
+ def deactivate_help(self):
+ """
+ Print a helper message to disable a virtual environment.
+ """
+
+ print("\n If you want to exit the virtualenv, you can use:")
+ print("\tdeactivate")
+
+ def get_virtenv(self):
+ """
+ Give a hint about how to activate an already-existing virtual
+ environment containing sphinx-build.
+
+ Returns a tuble with (activate_cmd_path, sphinx_version) with
+ the newest available virtual env.
+ """
+
+ cwd = os.getcwd()
+
+ activates = []
+
+ # Add all sphinx prefixes with possible version numbers
+ for p in self.virtenv_prefix:
+ activates += glob(f"{cwd}/{p}[0-9]*/bin/activate")
+
+ activates.sort(reverse=True, key=str.lower)
+
+ # Place sphinx_latest first, if it exists
+ for p in self.virtenv_prefix:
+ activates = glob(f"{cwd}/{p}*latest/bin/activate") + activates
+
+ ver = (0, 0, 0)
+ for f in activates:
+ # Discard too old Sphinx virtual environments
+ match = re.search(r"(\d+)\.(\d+)\.(\d+)", f)
+ if match:
+ ver = (int(match.group(1)), int(match.group(2)), int(match.group(3)))
+
+ if ver < self.min_version:
+ continue
+
+ sphinx_cmd = f.replace("activate", "sphinx-build")
+ if not os.path.isfile(sphinx_cmd):
+ continue
+
+ ver = self.get_sphinx_version(sphinx_cmd)
+
+ if not ver:
+ venv_dir = f.replace("/bin/activate", "")
+ print(f"Warning: virtual environment {venv_dir} is not working.\n" \
+ "Python version upgrade? Remove it with:\n\n" \
+ "\trm -rf {venv_dir}\n\n")
+ else:
+ if self.need_sphinx and ver >= self.min_version:
+ return (f, ver)
+ elif PythonVersion.parse_version(ver) > self.cur_version:
+ return (f, ver)
+
+ return ("", ver)
+
+ def recommend_sphinx_upgrade(self):
+ """
+ Check if Sphinx needs to be upgraded.
+
+ Returns a tuple with the higest available Sphinx version if found.
+ Otherwise, returns None to indicate either that no upgrade is needed
+ or no venv was found.
+ """
+
+ # Avoid running sphinx-builds from venv if cur_version is good
+ if self.cur_version and self.cur_version >= RECOMMENDED_VERSION:
+ self.latest_avail_ver = self.cur_version
+ return None
+
+ # Get the highest version from sphinx_*/bin/sphinx-build and the
+ # corresponding command to activate the venv/virtenv
+ self.activate_cmd, self.venv_ver = self.get_virtenv()
+
+ # Store the highest version from Sphinx existing virtualenvs
+ if self.activate_cmd and self.venv_ver > self.cur_version:
+ self.latest_avail_ver = self.venv_ver
+ else:
+ if self.cur_version:
+ self.latest_avail_ver = self.cur_version
+ else:
+ self.latest_avail_ver = (0, 0, 0)
+
+ # As we don't know package version of Sphinx, and there's no
+ # virtual environments, don't check if upgrades are needed
+ if not self.virtualenv:
+ if not self.latest_avail_ver:
+ return None
+
+ return self.latest_avail_ver
+
+ # Either there are already a virtual env or a new one should be created
+ self.need_pip = True
+
+ if not self.latest_avail_ver:
+ return None
+
+ # Return if the reason is due to an upgrade or not
+ if self.latest_avail_ver != (0, 0, 0):
+ if self.latest_avail_ver < RECOMMENDED_VERSION:
+ self.rec_sphinx_upgrade = 1
+
+ return self.latest_avail_ver
+
+ def recommend_package(self):
+ """
+ Recommend installing Sphinx as a distro-specific package.
+ """
+
+ print("\n2) As a package with:")
+
+ old_need = self.deps.need
+ old_optional = self.deps.optional
+
+ self.pdf = False
+ self.deps.optional = 0
+ old_verbose = self.verbose_warn_install
+ self.verbose_warn_install = 0
+
+ self.deps.clear_deps()
+
+ self.deps.add_package("python-sphinx", DepManager.PYTHON_MANDATORY)
+
+ cmd = self.get_install()
+ if cmd:
+ print(cmd)
+
+ self.deps.need = old_need
+ self.deps.optional = old_optional
+ self.verbose_warn_install = old_verbose
+
+ def recommend_sphinx_version(self, virtualenv_cmd):
+ """
+ Provide recommendations for installing or upgrading Sphinx based
+ on current version.
+
+ The logic here is complex, as it have to deal with different versions:
+
+ - minimal supported version;
+ - minimal PDF version;
+ - recommended version.
+
+ It also needs to work fine with both distro's package and
+ venv/virtualenv
+ """
+
+ if self.recommend_python:
+ cur_ver = sys.version_info[:3]
+ if cur_ver < MIN_PYTHON_VERSION:
+ print(f"\nPython version {cur_ver} is incompatible with doc build.\n" \
+ "Please upgrade it and re-run.\n")
+ return
+
+ # Version is OK. Nothing to do.
+ if self.cur_version != (0, 0, 0) and self.cur_version >= RECOMMENDED_VERSION:
+ return
+
+ if self.latest_avail_ver:
+ latest_avail_ver = PythonVersion.ver_str(self.latest_avail_ver)
+
+ if not self.need_sphinx:
+ # sphinx-build is present and its version is >= $min_version
+
+ # only recommend enabling a newer virtenv version if makes sense.
+ if self.latest_avail_ver and self.latest_avail_ver > self.cur_version:
+ print(f"\nYou may also use the newer Sphinx version {latest_avail_ver} with:")
+ if f"{self.virtenv_prefix}" in os.getcwd():
+ print("\tdeactivate")
+ print(f"\t. {self.activate_cmd}")
+ self.deactivate_help()
+ return
+
+ if self.latest_avail_ver and self.latest_avail_ver >= RECOMMENDED_VERSION:
+ return
+
+ if not self.virtualenv:
+ # No sphinx either via package or via virtenv. As we can't
+ # Compare the versions here, just return, recommending the
+ # user to install it from the package distro.
+ if not self.latest_avail_ver or self.latest_avail_ver == (0, 0, 0):
+ return
+
+ # User doesn't want a virtenv recommendation, but he already
+ # installed one via virtenv with a newer version.
+ # So, print commands to enable it
+ if self.latest_avail_ver > self.cur_version:
+ print(f"\nYou may also use the Sphinx virtualenv version {latest_avail_ver} with:")
+ if f"{self.virtenv_prefix}" in os.getcwd():
+ print("\tdeactivate")
+ print(f"\t. {self.activate_cmd}")
+ self.deactivate_help()
+ return
+ print("\n")
+ else:
+ if self.need_sphinx:
+ self.deps.need += 1
+
+ # Suggest newer versions if current ones are too old
+ if self.latest_avail_ver and self.latest_avail_ver >= self.min_version:
+ if self.latest_avail_ver >= RECOMMENDED_VERSION:
+ print(f"\nNeed to activate Sphinx (version {latest_avail_ver}) on virtualenv with:")
+ print(f"\t. {self.activate_cmd}")
+ self.deactivate_help()
+ return
+
+ # Version is above the minimal required one, but may be
+ # below the recommended one. So, print warnings/notes
+ if self.latest_avail_ver < RECOMMENDED_VERSION:
+ print(f"Warning: It is recommended at least Sphinx version {RECOMMENDED_VERSION}.")
+
+ # At this point, either it needs Sphinx or upgrade is recommended,
+ # both via pip
+
+ if self.rec_sphinx_upgrade:
+ if not self.virtualenv:
+ print("Instead of install/upgrade Python Sphinx pkg, you could use pip/pypi with:\n\n")
+ else:
+ print("To upgrade Sphinx, use:\n\n")
+ else:
+ print("\nSphinx needs to be installed either:\n1) via pip/pypi with:\n")
+
+ if not virtualenv_cmd:
+ print(" Currently not possible.\n")
+ print(" Please upgrade Python to a newer version and run this script again")
+ else:
+ print(f"\t{virtualenv_cmd} {self.virtenv_dir}")
+ print(f"\t. {self.virtenv_dir}/bin/activate")
+ print(f"\tpip install -r {self.requirement_file}")
+ self.deactivate_help()
+
+ if self.package_supported:
+ self.recommend_package()
+
+ print("\n" \
+ " Please note that Sphinx currentlys produce false-positive\n" \
+ " warnings when the same name is used for more than one type (functions,\n" \
+ " structs, enums,...). This is known Sphinx bug. For more details, see:\n" \
+ "\thttps://github.com/sphinx-doc/sphinx/pull/8313")
+
+ def check_needs(self):
+ """
+ Main method that checks needed dependencies and provides
+ recommendations.
+ """
+ self.python_cmd = sys.executable
+
+ # Check if Sphinx is already accessible from current environment
+ self.check_sphinx(self.conf)
+
+ if self.system_release:
+ print(f"Detected OS: {self.system_release}.")
+ else:
+ print("Unknown OS")
+ if self.cur_version != (0, 0, 0):
+ ver = PythonVersion.ver_str(self.cur_version)
+ print(f"Sphinx version: {ver}\n")
+
+ # Check the type of virtual env, depending on Python version
+ virtualenv_cmd = None
+
+ if sys.version_info < MIN_PYTHON_VERSION:
+ min_ver = ver_str(MIN_PYTHON_VERSION)
+ print(f"ERROR: at least python {min_ver} is required to build the kernel docs")
+ self.need_sphinx = 1
+
+ self.venv_ver = self.recommend_sphinx_upgrade()
+
+ if self.need_pip:
+ if sys.version_info < MIN_PYTHON_VERSION:
+ self.need_pip = False
+ print("Warning: python version is not supported.")
+ else:
+ virtualenv_cmd = f"{self.python_cmd} -m venv"
+ self.check_python_module("ensurepip")
+
+ # Check for needed programs/tools
+ self.check_perl_module("Pod::Usage", DepManager.SYSTEM_MANDATORY)
+
+ self.check_program("make", DepManager.SYSTEM_MANDATORY)
+ self.check_program("which", DepManager.SYSTEM_MANDATORY)
+
+ self.check_program("dot", DepManager.SYSTEM_OPTIONAL)
+ self.check_program("convert", DepManager.SYSTEM_OPTIONAL)
+
+ self.check_python_module("yaml")
+
+ if self.pdf:
+ self.check_program("xelatex", DepManager.PDF_MANDATORY)
+ self.check_program("rsvg-convert", DepManager.PDF_MANDATORY)
+ self.check_program("latexmk", DepManager.PDF_MANDATORY)
+
+ # Do distro-specific checks and output distro-install commands
+ cmd = self.get_install()
+ if cmd:
+ print(cmd)
+
+ # If distro requires some special instructions, print here.
+ # Please notice that get_install() needs to be called first.
+ if self.distro_msg:
+ print("\n" + self.distro_msg)
+
+ if not self.python_cmd:
+ if self.need == 1:
+ sys.exit("Can't build as 1 mandatory dependency is missing")
+ elif self.need:
+ sys.exit(f"Can't build as {self.need} mandatory dependencies are missing")
+
+ # Check if sphinx-build is called sphinx-build-3
+ if self.need_symlink:
+ sphinx_path = self.which("sphinx-build-3")
+ if sphinx_path:
+ print(f"\tsudo ln -sf {sphinx_path} /usr/bin/sphinx-build\n")
+
+ self.recommend_sphinx_version(virtualenv_cmd)
+ print("")
+
+ if not self.deps.optional:
+ print("All optional dependencies are met.")
+
+ if self.deps.need == 1:
+ sys.exit("Can't build as 1 mandatory dependency is missing")
+ elif self.deps.need:
+ sys.exit(f"Can't build as {self.deps.need} mandatory dependencies are missing")
+
+ print("Needed package dependencies are met.")
+
+DESCRIPTION = """
+Process some flags related to Sphinx installation and documentation build.
+"""
+
+
+def main():
+ """Main function"""
+ parser = argparse.ArgumentParser(description=DESCRIPTION)
+
+ parser.add_argument(
+ "--no-virtualenv",
+ action="store_false",
+ dest="virtualenv",
+ help="Recommend installing Sphinx instead of using a virtualenv",
+ )
+
+ parser.add_argument(
+ "--no-pdf",
+ action="store_false",
+ dest="pdf",
+ help="Don't check for dependencies required to build PDF docs",
+ )
+
+ parser.add_argument(
+ "--version-check",
+ action="store_true",
+ dest="version_check",
+ help="If version is compatible, don't check for missing dependencies",
+ )
+
+ args = parser.parse_args()
+
+ checker = SphinxDependencyChecker(args)
+
+ PythonVersion.check_python(MIN_PYTHON_VERSION,
+ bail_out=True, success_on_error=True)
+ checker.check_needs()
+
+# Call main if not used as module
+if __name__ == "__main__":
+ main()
diff --git a/tools/docs/test_doc_build.py b/tools/docs/test_doc_build.py
new file mode 100755
index 000000000000..47b4606569f9
--- /dev/null
+++ b/tools/docs/test_doc_build.py
@@ -0,0 +1,513 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+#
+# pylint: disable=R0903,R0912,R0913,R0914,R0917,C0301
+
+"""
+Install minimal supported requirements for different Sphinx versions
+and optionally test the build.
+"""
+
+import argparse
+import asyncio
+import os.path
+import shutil
+import sys
+import time
+import subprocess
+
+# Minimal python version supported by the building system.
+
+PYTHON = os.path.basename(sys.executable)
+
+min_python_bin = None
+
+for i in range(9, 13):
+ p = f"python3.{i}"
+ if shutil.which(p):
+ min_python_bin = p
+ break
+
+if not min_python_bin:
+ min_python_bin = PYTHON
+
+# Starting from 8.0, Python 3.9 is not supported anymore.
+PYTHON_VER_CHANGES = {(8, 0, 0): PYTHON}
+
+DEFAULT_VERSIONS_TO_TEST = [
+ (3, 4, 3), # Minimal supported version
+ (5, 3, 0), # CentOS Stream 9 / AlmaLinux 9
+ (6, 1, 1), # Debian 12
+ (7, 2, 1), # openSUSE Leap 15.6
+ (7, 2, 6), # Ubuntu 24.04 LTS
+ (7, 4, 7), # Ubuntu 24.10
+ (7, 3, 0), # openSUSE Tumbleweed
+ (8, 1, 3), # Fedora 42
+ (8, 2, 3) # Latest version - covers rolling distros
+]
+
+# Sphinx versions to be installed and their incremental requirements
+SPHINX_REQUIREMENTS = {
+ # Oldest versions we support for each package required by Sphinx 3.4.3
+ (3, 4, 3): {
+ "docutils": "0.16",
+ "alabaster": "0.7.12",
+ "babel": "2.8.0",
+ "certifi": "2020.6.20",
+ "docutils": "0.16",
+ "idna": "2.10",
+ "imagesize": "1.2.0",
+ "Jinja2": "2.11.2",
+ "MarkupSafe": "1.1.1",
+ "packaging": "20.4",
+ "Pygments": "2.6.1",
+ "PyYAML": "5.1",
+ "requests": "2.24.0",
+ "snowballstemmer": "2.0.0",
+ "sphinxcontrib-applehelp": "1.0.2",
+ "sphinxcontrib-devhelp": "1.0.2",
+ "sphinxcontrib-htmlhelp": "1.0.3",
+ "sphinxcontrib-jsmath": "1.0.1",
+ "sphinxcontrib-qthelp": "1.0.3",
+ "sphinxcontrib-serializinghtml": "1.1.4",
+ "urllib3": "1.25.9",
+ },
+
+ # Update package dependencies to a more modern base. The goal here
+ # is to avoid to many incremental changes for the next entries
+ (3, 5, 0): {
+ "alabaster": "0.7.13",
+ "babel": "2.17.0",
+ "certifi": "2025.6.15",
+ "idna": "3.10",
+ "imagesize": "1.4.1",
+ "packaging": "25.0",
+ "Pygments": "2.8.1",
+ "requests": "2.32.4",
+ "snowballstemmer": "3.0.1",
+ "sphinxcontrib-applehelp": "1.0.4",
+ "sphinxcontrib-htmlhelp": "2.0.1",
+ "sphinxcontrib-serializinghtml": "1.1.5",
+ "urllib3": "2.0.0",
+ },
+
+ # Starting from here, ensure all docutils versions are covered with
+ # supported Sphinx versions. Other packages are upgraded only when
+ # required by pip
+ (4, 0, 0): {
+ "PyYAML": "5.1",
+ },
+ (4, 1, 0): {
+ "docutils": "0.17",
+ "Pygments": "2.19.1",
+ "Jinja2": "3.0.3",
+ "MarkupSafe": "2.0",
+ },
+ (4, 3, 0): {},
+ (4, 4, 0): {},
+ (4, 5, 0): {
+ "docutils": "0.17.1",
+ },
+ (5, 0, 0): {},
+ (5, 1, 0): {},
+ (5, 2, 0): {
+ "docutils": "0.18",
+ "Jinja2": "3.1.2",
+ "MarkupSafe": "2.0",
+ "PyYAML": "5.3.1",
+ },
+ (5, 3, 0): {
+ "docutils": "0.18.1",
+ },
+ (6, 0, 0): {},
+ (6, 1, 0): {},
+ (6, 2, 0): {
+ "PyYAML": "5.4.1",
+ },
+ (7, 0, 0): {},
+ (7, 1, 0): {},
+ (7, 2, 0): {
+ "docutils": "0.19",
+ "PyYAML": "6.0.1",
+ "sphinxcontrib-serializinghtml": "1.1.9",
+ },
+ (7, 2, 6): {
+ "docutils": "0.20",
+ },
+ (7, 3, 0): {
+ "alabaster": "0.7.14",
+ "PyYAML": "6.0.1",
+ "tomli": "2.0.1",
+ },
+ (7, 4, 0): {
+ "docutils": "0.20.1",
+ "PyYAML": "6.0.1",
+ },
+ (8, 0, 0): {
+ "docutils": "0.21",
+ },
+ (8, 1, 0): {
+ "docutils": "0.21.1",
+ "PyYAML": "6.0.1",
+ "sphinxcontrib-applehelp": "1.0.7",
+ "sphinxcontrib-devhelp": "1.0.6",
+ "sphinxcontrib-htmlhelp": "2.0.6",
+ "sphinxcontrib-qthelp": "1.0.6",
+ },
+ (8, 2, 0): {
+ "docutils": "0.21.2",
+ "PyYAML": "6.0.1",
+ "sphinxcontrib-serializinghtml": "1.1.9",
+ },
+}
+
+
+class AsyncCommands:
+ """Excecute command synchronously"""
+
+ def __init__(self, fp=None):
+
+ self.stdout = None
+ self.stderr = None
+ self.output = None
+ self.fp = fp
+
+ def log(self, out, verbose, is_info=True):
+ out = out.removesuffix('\n')
+
+ if verbose:
+ if is_info:
+ print(out)
+ else:
+ print(out, file=sys.stderr)
+
+ if self.fp:
+ self.fp.write(out + "\n")
+
+ async def _read(self, stream, verbose, is_info):
+ """Ancillary routine to capture while displaying"""
+
+ while stream is not None:
+ line = await stream.readline()
+ if line:
+ out = line.decode("utf-8", errors="backslashreplace")
+ self.log(out, verbose, is_info)
+ if is_info:
+ self.stdout += out
+ else:
+ self.stderr += out
+ else:
+ break
+
+ async def run(self, cmd, capture_output=False, check=False,
+ env=None, verbose=True):
+
+ """
+ Execute an arbitrary command, handling errors.
+
+ Please notice that this class is not thread safe
+ """
+
+ self.stdout = ""
+ self.stderr = ""
+
+ self.log("$ " + " ".join(cmd), verbose)
+
+ proc = await asyncio.create_subprocess_exec(cmd[0],
+ *cmd[1:],
+ env=env,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE)
+
+ # Handle input and output in realtime
+ await asyncio.gather(
+ self._read(proc.stdout, verbose, True),
+ self._read(proc.stderr, verbose, False),
+ )
+
+ await proc.wait()
+
+ if check and proc.returncode > 0:
+ raise subprocess.CalledProcessError(returncode=proc.returncode,
+ cmd=" ".join(cmd),
+ output=self.stdout,
+ stderr=self.stderr)
+
+ if capture_output:
+ if proc.returncode > 0:
+ self.log(f"Error {proc.returncode}", verbose=True, is_info=False)
+ return ""
+
+ return self.output
+
+ ret = subprocess.CompletedProcess(args=cmd,
+ returncode=proc.returncode,
+ stdout=self.stdout,
+ stderr=self.stderr)
+
+ return ret
+
+
+class SphinxVenv:
+ """
+ Installs Sphinx on one virtual env per Sphinx version with a minimal
+ set of dependencies, adjusting them to each specific version.
+ """
+
+ def __init__(self):
+ """Initialize instance variables"""
+
+ self.built_time = {}
+ self.first_run = True
+
+ async def _handle_version(self, args, fp,
+ cur_ver, cur_requirements, python_bin):
+ """Handle a single Sphinx version"""
+
+ cmd = AsyncCommands(fp)
+
+ ver = ".".join(map(str, cur_ver))
+
+ if not self.first_run and args.wait_input and args.build:
+ ret = input("Press Enter to continue or 'a' to abort: ").strip().lower()
+ if ret == "a":
+ print("Aborted.")
+ sys.exit()
+ else:
+ self.first_run = False
+
+ venv_dir = f"Sphinx_{ver}"
+ req_file = f"requirements_{ver}.txt"
+
+ cmd.log(f"\nSphinx {ver} with {python_bin}", verbose=True)
+
+ # Create venv
+ await cmd.run([python_bin, "-m", "venv", venv_dir],
+ verbose=args.verbose, check=True)
+ pip = os.path.join(venv_dir, "bin/pip")
+
+ # Create install list
+ reqs = []
+ for pkg, verstr in cur_requirements.items():
+ reqs.append(f"{pkg}=={verstr}")
+
+ reqs.append(f"Sphinx=={ver}")
+
+ await cmd.run([pip, "install"] + reqs, check=True, verbose=args.verbose)
+
+ # Freeze environment
+ result = await cmd.run([pip, "freeze"], verbose=False, check=True)
+
+ # Pip install succeeded. Write requirements file
+ if args.req_file:
+ with open(req_file, "w", encoding="utf-8") as fp:
+ fp.write(result.stdout)
+
+ if args.build:
+ start_time = time.time()
+
+ # Prepare a venv environment
+ env = os.environ.copy()
+ bin_dir = os.path.join(venv_dir, "bin")
+ env["PATH"] = bin_dir + ":" + env["PATH"]
+ env["VIRTUAL_ENV"] = venv_dir
+ if "PYTHONHOME" in env:
+ del env["PYTHONHOME"]
+
+ # Test doc build
+ await cmd.run(["make", "cleandocs"], env=env, check=True)
+ make = ["make"]
+
+ if args.output:
+ sphinx_build = os.path.realpath(f"{bin_dir}/sphinx-build")
+ make += [f"O={args.output}", f"SPHINXBUILD={sphinx_build}"]
+
+ if args.make_args:
+ make += args.make_args
+
+ make += args.targets
+
+ if args.verbose:
+ cmd.log(f". {bin_dir}/activate", verbose=True)
+ await cmd.run(make, env=env, check=True, verbose=True)
+ if args.verbose:
+ cmd.log("deactivate", verbose=True)
+
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ hours, minutes = divmod(elapsed_time, 3600)
+ minutes, seconds = divmod(minutes, 60)
+
+ hours = int(hours)
+ minutes = int(minutes)
+ seconds = int(seconds)
+
+ self.built_time[ver] = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+
+ cmd.log(f"Finished doc build for Sphinx {ver}. Elapsed time: {self.built_time[ver]}", verbose=True)
+
+ async def run(self, args):
+ """
+ Navigate though multiple Sphinx versions, handling each of them
+ on a loop.
+ """
+
+ if args.log:
+ fp = open(args.log, "w", encoding="utf-8")
+ if not args.verbose:
+ args.verbose = False
+ else:
+ fp = None
+ if not args.verbose:
+ args.verbose = True
+
+ cur_requirements = {}
+ python_bin = min_python_bin
+
+ vers = set(SPHINX_REQUIREMENTS.keys()) | set(args.versions)
+
+ for cur_ver in sorted(vers):
+ if cur_ver in SPHINX_REQUIREMENTS:
+ new_reqs = SPHINX_REQUIREMENTS[cur_ver]
+ cur_requirements.update(new_reqs)
+
+ if cur_ver in PYTHON_VER_CHANGES: # pylint: disable=R1715
+ python_bin = PYTHON_VER_CHANGES[cur_ver]
+
+ if cur_ver not in args.versions:
+ continue
+
+ if args.min_version:
+ if cur_ver < args.min_version:
+ continue
+
+ if args.max_version:
+ if cur_ver > args.max_version:
+ break
+
+ await self._handle_version(args, fp, cur_ver, cur_requirements,
+ python_bin)
+
+ if args.build:
+ cmd = AsyncCommands(fp)
+ cmd.log("\nSummary:", verbose=True)
+ for ver, elapsed_time in sorted(self.built_time.items()):
+ cmd.log(f"\tSphinx {ver} elapsed time: {elapsed_time}",
+ verbose=True)
+
+ if fp:
+ fp.close()
+
+def parse_version(ver_str):
+ """Convert a version string into a tuple."""
+
+ return tuple(map(int, ver_str.split(".")))
+
+
+DEFAULT_VERS = " - "
+DEFAULT_VERS += "\n - ".join(map(lambda v: f"{v[0]}.{v[1]}.{v[2]}",
+ DEFAULT_VERSIONS_TO_TEST))
+
+SCRIPT = os.path.relpath(__file__)
+
+DESCRIPTION = f"""
+This tool allows creating Python virtual environments for different
+Sphinx versions that are supported by the Linux Kernel build system.
+
+Besides creating the virtual environment, it can also test building
+the documentation using "make htmldocs" (and/or other doc targets).
+
+If called without "--versions" argument, it covers the versions shipped
+on major distros, plus the lowest supported version:
+
+{DEFAULT_VERS}
+
+A typical usage is to run:
+
+ {SCRIPT} -m -l sphinx_builds.log
+
+This will create one virtual env for the default version set and run
+"make htmldocs" for each version, creating a log file with the
+excecuted commands on it.
+
+NOTE: The build time can be very long, specially on old versions. Also, there
+is a known bug with Sphinx version 6.0.x: each subprocess uses a lot of
+memory. That, together with "-jauto" may cause OOM killer to cause
+failures at the doc generation. To minimize the risk, you may use the
+"-a" command line parameter to constrain the built directories and/or
+reduce the number of threads from "-jauto" to, for instance, "-j4":
+
+ {SCRIPT} -m -V 6.0.1 -a "SPHINXDIRS=process" "SPHINXOPTS='-j4'"
+
+"""
+
+MAKE_TARGETS = [
+ "htmldocs",
+ "texinfodocs",
+ "infodocs",
+ "latexdocs",
+ "pdfdocs",
+ "epubdocs",
+ "xmldocs",
+]
+
+async def main():
+ """Main program"""
+
+ parser = argparse.ArgumentParser(description=DESCRIPTION,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+
+ ver_group = parser.add_argument_group("Version range options")
+
+ ver_group.add_argument('-V', '--versions', nargs="*",
+ default=DEFAULT_VERSIONS_TO_TEST,type=parse_version,
+ help='Sphinx versions to test')
+ ver_group.add_argument('--min-version', "--min", type=parse_version,
+ help='Sphinx minimal version')
+ ver_group.add_argument('--max-version', "--max", type=parse_version,
+ help='Sphinx maximum version')
+ ver_group.add_argument('-f', '--full', action='store_true',
+ help='Add all Sphinx (major,minor) supported versions to the version range')
+
+ build_group = parser.add_argument_group("Build options")
+
+ build_group.add_argument('-b', '--build', action='store_true',
+ help='Build documentation')
+ build_group.add_argument('-a', '--make-args', nargs="*",
+ help='extra arguments for make, like SPHINXDIRS=netlink/specs',
+ )
+ build_group.add_argument('-t', '--targets', nargs="+", choices=MAKE_TARGETS,
+ default=[MAKE_TARGETS[0]],
+ help="make build targets. Default: htmldocs.")
+ build_group.add_argument("-o", '--output',
+ help="output directory for the make O=OUTPUT")
+
+ other_group = parser.add_argument_group("Other options")
+
+ other_group.add_argument('-r', '--req-file', action='store_true',
+ help='write a requirements.txt file')
+ other_group.add_argument('-l', '--log',
+ help='Log command output on a file')
+ other_group.add_argument('-v', '--verbose', action='store_true',
+ help='Verbose all commands')
+ other_group.add_argument('-i', '--wait-input', action='store_true',
+ help='Wait for an enter before going to the next version')
+
+ args = parser.parse_args()
+
+ if not args.make_args:
+ args.make_args = []
+
+ sphinx_versions = sorted(list(SPHINX_REQUIREMENTS.keys()))
+
+ if args.full:
+ args.versions += list(SPHINX_REQUIREMENTS.keys())
+
+ venv = SphinxVenv()
+ await venv.run(args)
+
+
+# Call main method
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h
index e974ec932ec1..35f33780ca6c 100644
--- a/tools/include/asm-generic/bitops/__fls.h
+++ b/tools/include/asm-generic/bitops/__fls.h
@@ -10,7 +10,7 @@
*
* Undefined if no set bit exists, so code should check against 0 first.
*/
-static __always_inline unsigned int generic___fls(unsigned long word)
+static __always_inline __attribute_const__ unsigned int generic___fls(unsigned long word)
{
unsigned int num = BITS_PER_LONG - 1;
diff --git a/tools/include/asm-generic/bitops/fls.h b/tools/include/asm-generic/bitops/fls.h
index 26f3ce1dd6e4..8eed3437edb9 100644
--- a/tools/include/asm-generic/bitops/fls.h
+++ b/tools/include/asm-generic/bitops/fls.h
@@ -10,7 +10,7 @@
* Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
-static __always_inline int generic_fls(unsigned int x)
+static __always_inline __attribute_const__ int generic_fls(unsigned int x)
{
int r = 32;
diff --git a/tools/include/asm-generic/bitops/fls64.h b/tools/include/asm-generic/bitops/fls64.h
index 866f2b2304ff..b5f58dd261a3 100644
--- a/tools/include/asm-generic/bitops/fls64.h
+++ b/tools/include/asm-generic/bitops/fls64.h
@@ -16,7 +16,7 @@
* at position 64.
*/
#if BITS_PER_LONG == 32
-static __always_inline int fls64(__u64 x)
+static __always_inline __attribute_const__ int fls64(__u64 x)
{
__u32 h = x >> 32;
if (h)
@@ -24,7 +24,7 @@ static __always_inline int fls64(__u64 x)
return fls(x);
}
#elif BITS_PER_LONG == 64
-static __always_inline int fls64(__u64 x)
+static __always_inline __attribute_const__ int fls64(__u64 x)
{
if (x == 0)
return 0;
diff --git a/tools/include/linux/interval_tree_generic.h b/tools/include/linux/interval_tree_generic.h
index aaa8a0767aa3..c5a2fed49eb0 100644
--- a/tools/include/linux/interval_tree_generic.h
+++ b/tools/include/linux/interval_tree_generic.h
@@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \
* Cond2: start <= ITLAST(node) \
*/ \
\
-static ITSTRUCT * \
+ITSTATIC ITSTRUCT * \
ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \
{ \
while (true) { \
@@ -104,12 +104,8 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \
if (ITSTART(node) <= last) { /* Cond1 */ \
if (start <= ITLAST(node)) /* Cond2 */ \
return node; /* node is leftmost match */ \
- if (node->ITRB.rb_right) { \
- node = rb_entry(node->ITRB.rb_right, \
- ITSTRUCT, ITRB); \
- if (start <= node->ITSUBTREE) \
- continue; \
- } \
+ node = rb_entry(node->ITRB.rb_right, ITSTRUCT, ITRB); \
+ continue; \
} \
return NULL; /* No match */ \
} \
diff --git a/tools/include/linux/livepatch_external.h b/tools/include/linux/livepatch_external.h
new file mode 100644
index 000000000000..138af19b0f5c
--- /dev/null
+++ b/tools/include/linux/livepatch_external.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * External livepatch interfaces for patch creation tooling
+ */
+
+#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_
+#define _LINUX_LIVEPATCH_EXTERNAL_H_
+
+#include <linux/types.h>
+
+#define KLP_RELOC_SEC_PREFIX ".klp.rela."
+#define KLP_SYM_PREFIX ".klp.sym."
+
+#define __KLP_PRE_PATCH_PREFIX __klp_pre_patch_callback_
+#define __KLP_POST_PATCH_PREFIX __klp_post_patch_callback_
+#define __KLP_PRE_UNPATCH_PREFIX __klp_pre_unpatch_callback_
+#define __KLP_POST_UNPATCH_PREFIX __klp_post_unpatch_callback_
+
+#define KLP_PRE_PATCH_PREFIX __stringify(__KLP_PRE_PATCH_PREFIX)
+#define KLP_POST_PATCH_PREFIX __stringify(__KLP_POST_PATCH_PREFIX)
+#define KLP_PRE_UNPATCH_PREFIX __stringify(__KLP_PRE_UNPATCH_PREFIX)
+#define KLP_POST_UNPATCH_PREFIX __stringify(__KLP_POST_UNPATCH_PREFIX)
+
+struct klp_object;
+
+typedef int (*klp_pre_patch_t)(struct klp_object *obj);
+typedef void (*klp_post_patch_t)(struct klp_object *obj);
+typedef void (*klp_pre_unpatch_t)(struct klp_object *obj);
+typedef void (*klp_post_unpatch_t)(struct klp_object *obj);
+
+/**
+ * struct klp_callbacks - pre/post live-(un)patch callback structure
+ * @pre_patch: executed before code patching
+ * @post_patch: executed after code patching
+ * @pre_unpatch: executed before code unpatching
+ * @post_unpatch: executed after code unpatching
+ * @post_unpatch_enabled: flag indicating if post-unpatch callback
+ * should run
+ *
+ * All callbacks are optional. Only the pre-patch callback, if provided,
+ * will be unconditionally executed. If the parent klp_object fails to
+ * patch for any reason, including a non-zero error status returned from
+ * the pre-patch callback, no further callbacks will be executed.
+ */
+struct klp_callbacks {
+ klp_pre_patch_t pre_patch;
+ klp_post_patch_t post_patch;
+ klp_pre_unpatch_t pre_unpatch;
+ klp_post_unpatch_t post_unpatch;
+ bool post_unpatch_enabled;
+};
+
+/*
+ * 'struct klp_{func,object}_ext' are compact "external" representations of
+ * 'struct klp_{func,object}'. They are used by objtool for livepatch
+ * generation. The structs are then read by the livepatch module and converted
+ * to the real structs before calling klp_enable_patch().
+ *
+ * TODO make these the official API for klp_enable_patch(). That should
+ * simplify livepatch's interface as well as its data structure lifetime
+ * management.
+ */
+struct klp_func_ext {
+ const char *old_name;
+ void *new_func;
+ unsigned long sympos;
+};
+
+struct klp_object_ext {
+ const char *name;
+ struct klp_func_ext *funcs;
+ struct klp_callbacks callbacks;
+ unsigned int nr_funcs;
+};
+
+#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */
diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h
index aceac94632c8..c6def4049b1a 100644
--- a/tools/include/linux/objtool_types.h
+++ b/tools/include/linux/objtool_types.h
@@ -67,4 +67,6 @@ struct unwind_hint {
#define ANNOTYPE_REACHABLE 8
#define ANNOTYPE_NOCFI 9
+#define ANNOTYPE_DATA_SPECIAL 1
+
#endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h
index 8499f509f03e..51ad3cf4fa82 100644
--- a/tools/include/linux/string.h
+++ b/tools/include/linux/string.h
@@ -44,6 +44,20 @@ static inline bool strstarts(const char *str, const char *prefix)
return strncmp(str, prefix, strlen(prefix)) == 0;
}
+/*
+ * Checks if a string ends with another.
+ */
+static inline bool str_ends_with(const char *str, const char *substr)
+{
+ size_t len = strlen(str);
+ size_t sublen = strlen(substr);
+
+ if (sublen > len)
+ return false;
+
+ return !strcmp(str + len - sublen, substr);
+}
+
extern char * __must_check skip_spaces(const char *);
extern char *strim(char *);
diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile
index 143c2d2c2ba6..8118e22844f1 100644
--- a/tools/include/nolibc/Makefile
+++ b/tools/include/nolibc/Makefile
@@ -23,7 +23,7 @@ else
Q=@
endif
-arch_file := arch-$(ARCH).h
+arch_files := arch.h $(wildcard arch-*.h)
all_files := \
compiler.h \
crt.h \
@@ -33,6 +33,7 @@ all_files := \
errno.h \
fcntl.h \
getopt.h \
+ inttypes.h \
limits.h \
math.h \
nolibc.h \
@@ -56,12 +57,14 @@ all_files := \
sys/random.h \
sys/reboot.h \
sys/resource.h \
+ sys/select.h \
sys/stat.h \
sys/syscall.h \
sys/sysmacros.h \
sys/time.h \
sys/timerfd.h \
sys/types.h \
+ sys/uio.h \
sys/utsname.h \
sys/wait.h \
time.h \
@@ -79,7 +82,7 @@ help:
@echo "Supported targets under nolibc:"
@echo " all call \"headers\""
@echo " clean clean the sysroot"
- @echo " headers prepare a sysroot in tools/include/nolibc/sysroot"
+ @echo " headers prepare a multi-arch sysroot in \$${OUTPUT}sysroot"
@echo " headers_standalone like \"headers\", and also install kernel headers"
@echo " help this help"
@echo ""
@@ -90,18 +93,11 @@ help:
@echo " OUTPUT = $(OUTPUT)"
@echo ""
+# installs headers for all archs at once.
headers:
- $(Q)mkdir -p $(OUTPUT)sysroot
- $(Q)mkdir -p $(OUTPUT)sysroot/include
- $(Q)cp --parents $(all_files) $(OUTPUT)sysroot/include/
- $(Q)if [ "$(ARCH)" = "i386" -o "$(ARCH)" = "x86_64" ]; then \
- cat arch-x86.h; \
- elif [ -e "$(arch_file)" ]; then \
- cat $(arch_file); \
- else \
- echo "Fatal: architecture $(ARCH) not yet supported by nolibc." >&2; \
- exit 1; \
- fi > $(OUTPUT)sysroot/include/arch.h
+ $(Q)mkdir -p "$(OUTPUT)sysroot"
+ $(Q)mkdir -p "$(OUTPUT)sysroot/include"
+ $(Q)cp --parents $(arch_files) $(all_files) "$(OUTPUT)sysroot/include/"
headers_standalone: headers
$(Q)$(MAKE) -C $(srctree) headers
diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h
index 1f66e7e5a444..251c42579028 100644
--- a/tools/include/nolibc/arch-arm.h
+++ b/tools/include/nolibc/arch-arm.h
@@ -184,6 +184,7 @@
_arg1; \
})
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
@@ -193,5 +194,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_ARCH_ARM_H */
diff --git a/tools/include/nolibc/arch-arm64.h b/tools/include/nolibc/arch-arm64.h
index 02a3f74c8ec8..080a55a7144e 100644
--- a/tools/include/nolibc/arch-arm64.h
+++ b/tools/include/nolibc/arch-arm64.h
@@ -141,6 +141,7 @@
_arg1; \
})
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
@@ -150,4 +151,5 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_ARCH_ARM64_H */
diff --git a/tools/include/nolibc/arch-loongarch.h b/tools/include/nolibc/arch-loongarch.h
index 5511705303ea..c894176c3f89 100644
--- a/tools/include/nolibc/arch-loongarch.h
+++ b/tools/include/nolibc/arch-loongarch.h
@@ -142,6 +142,7 @@
_arg1; \
})
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
@@ -151,5 +152,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_ARCH_LOONGARCH_H */
diff --git a/tools/include/nolibc/arch-m68k.h b/tools/include/nolibc/arch-m68k.h
index 6dac1845f298..2a4fbada5e79 100644
--- a/tools/include/nolibc/arch-m68k.h
+++ b/tools/include/nolibc/arch-m68k.h
@@ -128,6 +128,7 @@
_num; \
})
+#ifndef NOLIBC_NO_RUNTIME
void _start(void);
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
@@ -137,5 +138,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_ARCH_M68K_H */
diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h
index 0cbac63b249a..a72506ceec6b 100644
--- a/tools/include/nolibc/arch-mips.h
+++ b/tools/include/nolibc/arch-mips.h
@@ -245,6 +245,7 @@
#endif /* _ABIO32 */
+#ifndef NOLIBC_NO_RUNTIME
/* startup code, note that it's called __start on MIPS */
void __start(void);
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __start(void)
@@ -266,5 +267,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_ARCH_MIPS_H */
diff --git a/tools/include/nolibc/arch-powerpc.h b/tools/include/nolibc/arch-powerpc.h
index 204564bbcd32..e0c7e0b81f7c 100644
--- a/tools/include/nolibc/arch-powerpc.h
+++ b/tools/include/nolibc/arch-powerpc.h
@@ -183,6 +183,7 @@
#endif
#endif /* !__powerpc64__ */
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
@@ -215,5 +216,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
#endif
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_ARCH_POWERPC_H */
diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h
index 885383a86c38..1c00cacf57e1 100644
--- a/tools/include/nolibc/arch-riscv.h
+++ b/tools/include/nolibc/arch-riscv.h
@@ -139,6 +139,7 @@
_arg1; \
})
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
@@ -152,5 +153,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_ARCH_RISCV_H */
diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h
index df4c3cc713ac..74125a254ce3 100644
--- a/tools/include/nolibc/arch-s390.h
+++ b/tools/include/nolibc/arch-s390.h
@@ -139,22 +139,19 @@
_arg1; \
})
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
__asm__ volatile (
-#ifdef __s390x__
"lgr %r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */
"aghi %r15, -160\n" /* allocate new stackframe */
-#else
- "lr %r2, %r15\n"
- "ahi %r15, -96\n"
-#endif
"xc 0(8,%r15), 0(%r15)\n" /* clear backchain */
"brasl %r14, _start_c\n" /* transfer to c runtime */
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
struct s390_mmap_arg_struct {
unsigned long addr;
diff --git a/tools/include/nolibc/arch-sh.h b/tools/include/nolibc/arch-sh.h
index a96b8914607e..7a421197d104 100644
--- a/tools/include/nolibc/arch-sh.h
+++ b/tools/include/nolibc/arch-sh.h
@@ -140,6 +140,7 @@
_ret; \
})
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
void _start_wrapper(void);
void __attribute__((weak,noreturn)) __nolibc_entrypoint __no_stack_protector _start_wrapper(void)
@@ -158,5 +159,6 @@ void __attribute__((weak,noreturn)) __nolibc_entrypoint __no_stack_protector _st
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_ARCH_SH_H */
diff --git a/tools/include/nolibc/arch-sparc.h b/tools/include/nolibc/arch-sparc.h
index ca420d843e25..2ebb5686e105 100644
--- a/tools/include/nolibc/arch-sparc.h
+++ b/tools/include/nolibc/arch-sparc.h
@@ -152,6 +152,7 @@
_arg1; \
})
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
@@ -169,6 +170,7 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
static pid_t getpid(void);
diff --git a/tools/include/nolibc/arch-x86.h b/tools/include/nolibc/arch-x86.h
index d3efc0c3b8ad..f6c43ac5377b 100644
--- a/tools/include/nolibc/arch-x86.h
+++ b/tools/include/nolibc/arch-x86.h
@@ -157,6 +157,7 @@
_eax; \
})
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
/*
* i386 System V ABI mandates:
@@ -176,6 +177,7 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#else /* !defined(__x86_64__) */
@@ -323,6 +325,7 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
_ret; \
})
+#ifndef NOLIBC_NO_RUNTIME
/* startup code */
/*
* x86-64 System V ABI mandates:
@@ -340,6 +343,7 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s
);
__nolibc_entrypoint_epilogue();
}
+#endif /* NOLIBC_NO_RUNTIME */
#define NOLIBC_ARCH_HAS_MEMMOVE
void *memmove(void *dst, const void *src, size_t len);
@@ -351,7 +355,7 @@ void *memcpy(void *dst, const void *src, size_t len);
void *memset(void *dst, int c, size_t len);
__asm__ (
-".section .text.nolibc_memmove_memcpy\n"
+".pushsection .text.nolibc_memmove_memcpy\n"
".weak memmove\n"
".weak memcpy\n"
"memmove:\n"
@@ -371,8 +375,9 @@ __asm__ (
"rep movsb\n\t"
"cld\n\t"
"retq\n"
+".popsection\n"
-".section .text.nolibc_memset\n"
+".pushsection .text.nolibc_memset\n"
".weak memset\n"
"memset:\n"
"xchgl %eax, %esi\n\t"
@@ -381,6 +386,7 @@ __asm__ (
"rep stosb\n\t"
"popq %rax\n\t"
"retq\n"
+".popsection\n"
);
#endif /* !defined(__x86_64__) */
diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h
index 426c89198135..a3adaf433f2c 100644
--- a/tools/include/nolibc/arch.h
+++ b/tools/include/nolibc/arch.h
@@ -3,15 +3,6 @@
* Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu>
*/
-/* Below comes the architecture-specific code. For each architecture, we have
- * the syscall declarations and the _start code definition. This is the only
- * global part. On all architectures the kernel puts everything in the stack
- * before jumping to _start just above us, without any return address (_start
- * is not a function but an entry point). So at the stack pointer we find argc.
- * Then argv[] begins, and ends at the first NULL. Then we have envp which
- * starts and ends with a NULL as well. So envp=argv+argc+1.
- */
-
#ifndef _NOLIBC_ARCH_H
#define _NOLIBC_ARCH_H
@@ -27,7 +18,7 @@
#include "arch-powerpc.h"
#elif defined(__riscv)
#include "arch-riscv.h"
-#elif defined(__s390x__) || defined(__s390__)
+#elif defined(__s390x__)
#include "arch-s390.h"
#elif defined(__loongarch__)
#include "arch-loongarch.h"
diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h
index 369cfb5a0e78..87090bbc53e0 100644
--- a/tools/include/nolibc/compiler.h
+++ b/tools/include/nolibc/compiler.h
@@ -41,8 +41,8 @@
# define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector")))
#endif /* __nolibc_has_attribute(no_stack_protector) */
-#if __nolibc_has_attribute(fallthrough)
-# define __nolibc_fallthrough do { } while (0); __attribute__((fallthrough))
+#if __nolibc_has_attribute(__fallthrough__)
+# define __nolibc_fallthrough do { } while (0); __attribute__((__fallthrough__))
#else
# define __nolibc_fallthrough do { } while (0)
#endif /* __nolibc_has_attribute(fallthrough) */
diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h
index 961cfe777c35..d9262998dae9 100644
--- a/tools/include/nolibc/crt.h
+++ b/tools/include/nolibc/crt.h
@@ -7,6 +7,8 @@
#ifndef _NOLIBC_CRT_H
#define _NOLIBC_CRT_H
+#ifndef NOLIBC_NO_RUNTIME
+
#include "compiler.h"
char **environ __attribute__((weak));
@@ -88,4 +90,5 @@ void _start_c(long *sp)
exit(exitcode);
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_CRT_H */
diff --git a/tools/include/nolibc/dirent.h b/tools/include/nolibc/dirent.h
index 758b95c48e7a..61a122a60327 100644
--- a/tools/include/nolibc/dirent.h
+++ b/tools/include/nolibc/dirent.h
@@ -86,9 +86,9 @@ int readdir_r(DIR *dirp, struct dirent *entry, struct dirent **result)
* readdir() can only return one entry at a time.
* Make sure the non-returned ones are not skipped.
*/
- ret = lseek(fd, ldir->d_off, SEEK_SET);
- if (ret == -1)
- return errno;
+ ret = sys_lseek(fd, ldir->d_off, SEEK_SET);
+ if (ret < 0)
+ return -ret;
entry->d_ino = ldir->d_ino;
/* the destination should always be big enough */
diff --git a/tools/include/nolibc/getopt.h b/tools/include/nolibc/getopt.h
index 217abb95264b..87565e3b6a33 100644
--- a/tools/include/nolibc/getopt.h
+++ b/tools/include/nolibc/getopt.h
@@ -78,7 +78,7 @@ int getopt(int argc, char * const argv[], const char *optstring)
return '?';
}
if (optstring[i] == ':') {
- optarg = 0;
+ optarg = NULL;
if (optstring[i + 1] != ':' || __optpos) {
optarg = argv[optind++];
if (__optpos)
diff --git a/tools/include/nolibc/inttypes.h b/tools/include/nolibc/inttypes.h
new file mode 100644
index 000000000000..1977bd74bfeb
--- /dev/null
+++ b/tools/include/nolibc/inttypes.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+
+#include "nolibc.h"
diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index d2f5aa085f8e..272dfc961158 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -104,11 +104,13 @@
#include "sys/random.h"
#include "sys/reboot.h"
#include "sys/resource.h"
+#include "sys/select.h"
#include "sys/stat.h"
#include "sys/syscall.h"
#include "sys/sysmacros.h"
#include "sys/time.h"
#include "sys/timerfd.h"
+#include "sys/uio.h"
#include "sys/utsname.h"
#include "sys/wait.h"
#include "ctype.h"
diff --git a/tools/include/nolibc/stackprotector.h b/tools/include/nolibc/stackprotector.h
index c71a2c257177..7123aa056cb0 100644
--- a/tools/include/nolibc/stackprotector.h
+++ b/tools/include/nolibc/stackprotector.h
@@ -9,6 +9,7 @@
#include "compiler.h"
+#ifndef NOLIBC_NO_RUNTIME
#if defined(_NOLIBC_STACKPROTECTOR)
#include "sys.h"
@@ -49,5 +50,6 @@ static __no_stack_protector void __stack_chk_init(void)
#else /* !defined(_NOLIBC_STACKPROTECTOR) */
static void __stack_chk_init(void) {}
#endif /* defined(_NOLIBC_STACKPROTECTOR) */
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_STACKPROTECTOR_H */
diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h
index 2c1ad23b9b5c..392f4dd94158 100644
--- a/tools/include/nolibc/std.h
+++ b/tools/include/nolibc/std.h
@@ -20,13 +20,13 @@
/* those are commonly provided by sys/types.h */
typedef unsigned int dev_t;
-typedef unsigned long ino_t;
+typedef uint64_t ino_t;
typedef unsigned int mode_t;
typedef signed int pid_t;
typedef unsigned int uid_t;
typedef unsigned int gid_t;
typedef unsigned long nlink_t;
-typedef signed long off_t;
+typedef int64_t off_t;
typedef signed long blksize_t;
typedef signed long blkcnt_t;
typedef __kernel_time_t time_t;
diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h
index 7630234408c5..1f16dab2ac88 100644
--- a/tools/include/nolibc/stdio.h
+++ b/tools/include/nolibc/stdio.h
@@ -321,11 +321,13 @@ int __nolibc_printf(__nolibc_printf_cb cb, intptr_t state, size_t n, const char
if (!outstr)
outstr="(null)";
}
-#ifndef NOLIBC_IGNORE_ERRNO
else if (c == 'm') {
+#ifdef NOLIBC_IGNORE_ERRNO
+ outstr = "unknown error";
+#else
outstr = strerror(errno);
- }
#endif /* NOLIBC_IGNORE_ERRNO */
+ }
else if (c == '%') {
/* queue it verbatim */
continue;
@@ -600,7 +602,11 @@ int sscanf(const char *str, const char *format, ...)
static __attribute__((unused))
void perror(const char *msg)
{
+#ifdef NOLIBC_IGNORE_ERRNO
+ fprintf(stderr, "%s%sunknown error\n", (msg && *msg) ? msg : "", (msg && *msg) ? ": " : "");
+#else
fprintf(stderr, "%s%serrno=%d\n", (msg && *msg) ? msg : "", (msg && *msg) ? ": " : "", errno);
+#endif
}
static __attribute__((unused))
diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h
index 5fd99a480f82..f184e108ed0a 100644
--- a/tools/include/nolibc/stdlib.h
+++ b/tools/include/nolibc/stdlib.h
@@ -100,6 +100,7 @@ void free(void *ptr)
munmap(heap, heap->len);
}
+#ifndef NOLIBC_NO_RUNTIME
/* getenv() tries to find the environment variable named <name> in the
* environment array pointed to by global variable "environ" which must be
* declared as a char **, and must be terminated by a NULL (it is recommended
@@ -122,6 +123,7 @@ char *getenv(const char *name)
}
return NULL;
}
+#endif /* NOLIBC_NO_RUNTIME */
static __attribute__((unused))
void *malloc(size_t len)
diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h
index 163a17e7dd38..4000926f44ac 100644
--- a/tools/include/nolibc/string.h
+++ b/tools/include/nolibc/string.h
@@ -93,6 +93,21 @@ void *memset(void *dst, int b, size_t len)
}
#endif /* #ifndef NOLIBC_ARCH_HAS_MEMSET */
+#ifndef NOLIBC_ARCH_HAS_MEMCHR
+static __attribute__((unused))
+void *memchr(const void *s, int c, size_t len)
+{
+ char *p = (char *)s;
+
+ while (len--) {
+ if (*p == (char)c)
+ return p;
+ p++;
+ }
+ return NULL;
+}
+#endif /* #ifndef NOLIBC_ARCH_HAS_MEMCHR */
+
static __attribute__((unused))
char *strchr(const char *s, int c)
{
diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index c5564f57deec..847af1ccbdc9 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -106,7 +106,7 @@ static __attribute__((unused))
void *sbrk(intptr_t inc)
{
/* first call to find current end */
- void *ret = sys_brk(0);
+ void *ret = sys_brk(NULL);
if (ret && sys_brk(ret + inc) == ret + inc)
return ret + inc;
@@ -118,6 +118,7 @@ void *sbrk(intptr_t inc)
/*
* int chdir(const char *path);
+ * int fchdir(int fildes);
*/
static __attribute__((unused))
@@ -132,6 +133,18 @@ int chdir(const char *path)
return __sysret(sys_chdir(path));
}
+static __attribute__((unused))
+int sys_fchdir(int fildes)
+{
+ return my_syscall1(__NR_fchdir, fildes);
+}
+
+static __attribute__((unused))
+int fchdir(int fildes)
+{
+ return __sysret(sys_fchdir(fildes));
+}
+
/*
* int chmod(const char *path, mode_t mode);
@@ -512,6 +525,7 @@ pid_t gettid(void)
return sys_gettid();
}
+#ifndef NOLIBC_NO_RUNTIME
static unsigned long getauxval(unsigned long key);
/*
@@ -523,7 +537,7 @@ int getpagesize(void)
{
return __sysret((int)getauxval(AT_PAGESZ) ?: -ENOENT);
}
-
+#endif /* NOLIBC_NO_RUNTIME */
/*
* uid_t getuid(void);
@@ -591,23 +605,20 @@ int link(const char *old, const char *new)
static __attribute__((unused))
off_t sys_lseek(int fd, off_t offset, int whence)
{
-#if defined(__NR_lseek)
- return my_syscall3(__NR_lseek, fd, offset, whence);
-#else
+#if defined(__NR_llseek)
__kernel_loff_t loff = 0;
off_t result;
int ret;
- /* Only exists on 32bit where nolibc off_t is also 32bit */
- ret = my_syscall5(__NR_llseek, fd, 0, offset, &loff, whence);
+ ret = my_syscall5(__NR_llseek, fd, offset >> 32, (uint32_t)offset, &loff, whence);
if (ret < 0)
result = ret;
- else if (loff != (off_t)loff)
- result = -EOVERFLOW;
else
result = loff;
return result;
+#else
+ return my_syscall3(__NR_lseek, fd, offset, whence);
#endif
}
@@ -756,51 +767,6 @@ int sched_yield(void)
/*
- * int select(int nfds, fd_set *read_fds, fd_set *write_fds,
- * fd_set *except_fds, struct timeval *timeout);
- */
-
-static __attribute__((unused))
-int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout)
-{
-#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect)
- struct sel_arg_struct {
- unsigned long n;
- fd_set *r, *w, *e;
- struct timeval *t;
- } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout };
- return my_syscall1(__NR_select, &arg);
-#elif defined(__NR__newselect)
- return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout);
-#elif defined(__NR_select)
- return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout);
-#elif defined(__NR_pselect6)
- struct timespec t;
-
- if (timeout) {
- t.tv_sec = timeout->tv_sec;
- t.tv_nsec = timeout->tv_usec * 1000;
- }
- return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
-#else
- struct __kernel_timespec t;
-
- if (timeout) {
- t.tv_sec = timeout->tv_sec;
- t.tv_nsec = timeout->tv_usec * 1000;
- }
- return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
-#endif
-}
-
-static __attribute__((unused))
-int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout)
-{
- return __sysret(sys_select(nfds, rfds, wfds, efds, timeout));
-}
-
-
-/*
* int setpgid(pid_t pid, pid_t pgid);
*/
diff --git a/tools/include/nolibc/sys/auxv.h b/tools/include/nolibc/sys/auxv.h
index c52463d6c18d..0e98325e7347 100644
--- a/tools/include/nolibc/sys/auxv.h
+++ b/tools/include/nolibc/sys/auxv.h
@@ -10,6 +10,8 @@
#ifndef _NOLIBC_SYS_AUXV_H
#define _NOLIBC_SYS_AUXV_H
+#ifndef NOLIBC_NO_RUNTIME
+
#include "../crt.h"
static __attribute__((unused))
@@ -38,4 +40,5 @@ unsigned long getauxval(unsigned long type)
return ret;
}
+#endif /* NOLIBC_NO_RUNTIME */
#endif /* _NOLIBC_SYS_AUXV_H */
diff --git a/tools/include/nolibc/sys/mman.h b/tools/include/nolibc/sys/mman.h
index 5228751b458c..77084ac3405a 100644
--- a/tools/include/nolibc/sys/mman.h
+++ b/tools/include/nolibc/sys/mman.h
@@ -31,11 +31,6 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd,
}
#endif
-/* Note that on Linux, MAP_FAILED is -1 so we can use the generic __sysret()
- * which returns -1 upon error and still satisfy user land that checks for
- * MAP_FAILED.
- */
-
static __attribute__((unused))
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
{
diff --git a/tools/include/nolibc/sys/reboot.h b/tools/include/nolibc/sys/reboot.h
index 4a1e435be669..38274c64a722 100644
--- a/tools/include/nolibc/sys/reboot.h
+++ b/tools/include/nolibc/sys/reboot.h
@@ -28,7 +28,7 @@ ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg)
static __attribute__((unused))
int reboot(int cmd)
{
- return __sysret(sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0));
+ return __sysret(sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, NULL));
}
#endif /* _NOLIBC_SYS_REBOOT_H */
diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h
new file mode 100644
index 000000000000..2a5619c01277
--- /dev/null
+++ b/tools/include/nolibc/sys/select.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+
+#include "../nolibc.h"
+
+#ifndef _NOLIBC_SYS_SELECT_H
+#define _NOLIBC_SYS_SELECT_H
+
+#include <linux/time.h>
+#include <linux/unistd.h>
+
+/* commonly an fd_set represents 256 FDs */
+#ifndef FD_SETSIZE
+#define FD_SETSIZE 256
+#endif
+
+#define FD_SETIDXMASK (8 * sizeof(unsigned long))
+#define FD_SETBITMASK (8 * sizeof(unsigned long)-1)
+
+/* for select() */
+typedef struct {
+ unsigned long fds[(FD_SETSIZE + FD_SETBITMASK) / FD_SETIDXMASK];
+} fd_set;
+
+#define FD_CLR(fd, set) do { \
+ fd_set *__set = (set); \
+ int __fd = (fd); \
+ if (__fd >= 0) \
+ __set->fds[__fd / FD_SETIDXMASK] &= \
+ ~(1U << (__fd & FD_SETBITMASK)); \
+ } while (0)
+
+#define FD_SET(fd, set) do { \
+ fd_set *__set = (set); \
+ int __fd = (fd); \
+ if (__fd >= 0) \
+ __set->fds[__fd / FD_SETIDXMASK] |= \
+ 1 << (__fd & FD_SETBITMASK); \
+ } while (0)
+
+#define FD_ISSET(fd, set) ({ \
+ fd_set *__set = (set); \
+ int __fd = (fd); \
+ int __r = 0; \
+ if (__fd >= 0) \
+ __r = !!(__set->fds[__fd / FD_SETIDXMASK] & \
+1U << (__fd & FD_SETBITMASK)); \
+ __r; \
+ })
+
+#define FD_ZERO(set) do { \
+ fd_set *__set = (set); \
+ int __idx; \
+ int __size = (FD_SETSIZE+FD_SETBITMASK) / FD_SETIDXMASK;\
+ for (__idx = 0; __idx < __size; __idx++) \
+ __set->fds[__idx] = 0; \
+ } while (0)
+
+/*
+ * int select(int nfds, fd_set *read_fds, fd_set *write_fds,
+ * fd_set *except_fds, struct timeval *timeout);
+ */
+
+static __attribute__((unused))
+int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout)
+{
+#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect)
+ struct sel_arg_struct {
+ unsigned long n;
+ fd_set *r, *w, *e;
+ struct timeval *t;
+ } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout };
+ return my_syscall1(__NR_select, &arg);
+#elif defined(__NR__newselect)
+ return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout);
+#elif defined(__NR_select)
+ return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout);
+#elif defined(__NR_pselect6)
+ struct timespec t;
+
+ if (timeout) {
+ t.tv_sec = timeout->tv_sec;
+ t.tv_nsec = timeout->tv_usec * 1000;
+ }
+ return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
+#else
+ struct __kernel_timespec t;
+
+ if (timeout) {
+ t.tv_sec = timeout->tv_sec;
+ t.tv_nsec = timeout->tv_usec * 1000;
+ }
+ return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
+#endif
+}
+
+static __attribute__((unused))
+int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout)
+{
+ return __sysret(sys_select(nfds, rfds, wfds, efds, timeout));
+}
+
+
+#endif /* _NOLIBC_SYS_SELECT_H */
diff --git a/tools/include/nolibc/sys/uio.h b/tools/include/nolibc/sys/uio.h
new file mode 100644
index 000000000000..7ad42b927d2f
--- /dev/null
+++ b/tools/include/nolibc/sys/uio.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * uio for NOLIBC
+ * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu>
+ * Copyright (C) 2025 Intel Corporation
+ */
+
+/* make sure to include all global symbols */
+#include "../nolibc.h"
+
+#ifndef _NOLIBC_SYS_UIO_H
+#define _NOLIBC_SYS_UIO_H
+
+#include "../sys.h"
+#include <linux/uio.h>
+
+
+/*
+ * ssize_t readv(int fd, const struct iovec *iovec, int count);
+ */
+static __attribute__((unused))
+ssize_t sys_readv(int fd, const struct iovec *iovec, int count)
+{
+ return my_syscall3(__NR_readv, fd, iovec, count);
+}
+
+static __attribute__((unused))
+ssize_t readv(int fd, const struct iovec *iovec, int count)
+{
+ return __sysret(sys_readv(fd, iovec, count));
+}
+
+/*
+ * ssize_t writev(int fd, const struct iovec *iovec, int count);
+ */
+static __attribute__((unused))
+ssize_t sys_writev(int fd, const struct iovec *iovec, int count)
+{
+ return my_syscall3(__NR_writev, fd, iovec, count);
+}
+
+static __attribute__((unused))
+ssize_t writev(int fd, const struct iovec *iovec, int count)
+{
+ return __sysret(sys_writev(fd, iovec, count));
+}
+
+
+#endif /* _NOLIBC_SYS_UIO_H */
diff --git a/tools/include/nolibc/sys/wait.h b/tools/include/nolibc/sys/wait.h
index 4e66e1f7a03e..9d9319ba92cb 100644
--- a/tools/include/nolibc/sys/wait.h
+++ b/tools/include/nolibc/sys/wait.h
@@ -65,23 +65,29 @@ pid_t waitpid(pid_t pid, int *status, int options)
switch (info.si_code) {
case 0:
- *status = 0;
+ if (status)
+ *status = 0;
break;
case CLD_EXITED:
- *status = (info.si_status & 0xff) << 8;
+ if (status)
+ *status = (info.si_status & 0xff) << 8;
break;
case CLD_KILLED:
- *status = info.si_status & 0x7f;
+ if (status)
+ *status = info.si_status & 0x7f;
break;
case CLD_DUMPED:
- *status = (info.si_status & 0x7f) | 0x80;
+ if (status)
+ *status = (info.si_status & 0x7f) | 0x80;
break;
case CLD_STOPPED:
case CLD_TRAPPED:
- *status = (info.si_status << 8) + 0x7f;
+ if (status)
+ *status = (info.si_status << 8) + 0x7f;
break;
case CLD_CONTINUED:
- *status = 0xffff;
+ if (status)
+ *status = 0xffff;
break;
default:
return -1;
diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h
index 6c276b8d646a..48e78f8becf9 100644
--- a/tools/include/nolibc/time.h
+++ b/tools/include/nolibc/time.h
@@ -89,13 +89,11 @@ int sys_clock_settime(clockid_t clockid, struct timespec *tp)
{
#if defined(__NR_clock_settime)
return my_syscall2(__NR_clock_settime, clockid, tp);
-#elif defined(__NR_clock_settime64)
+#else
struct __kernel_timespec ktp;
__nolibc_timespec_user_to_kernel(tp, &ktp);
return my_syscall2(__NR_clock_settime64, clockid, &ktp);
-#else
- return __nolibc_enosys(__func__, clockid, tp);
#endif
}
@@ -111,7 +109,7 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt
{
#if defined(__NR_clock_nanosleep)
return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp);
-#elif defined(__NR_clock_nanosleep_time64)
+#else
struct __kernel_timespec krqtp, krmtp;
int ret;
@@ -120,8 +118,6 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt
if (rmtp)
__nolibc_timespec_kernel_to_user(&krmtp, rmtp);
return ret;
-#else
- return __nolibc_enosys(__func__, clockid, flags, rqtp, rmtp);
#endif
}
@@ -195,7 +191,7 @@ int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value)
{
#if defined(__NR_timer_gettime)
return my_syscall2(__NR_timer_gettime, timerid, curr_value);
-#elif defined(__NR_timer_gettime64)
+#else
struct __kernel_itimerspec kcurr_value;
int ret;
@@ -203,8 +199,6 @@ int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value)
__nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval);
__nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value);
return ret;
-#else
- return __nolibc_enosys(__func__, timerid, curr_value);
#endif
}
@@ -220,7 +214,7 @@ int sys_timer_settime(timer_t timerid, int flags,
{
#if defined(__NR_timer_settime)
return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value);
-#elif defined(__NR_timer_settime64)
+#else
struct __kernel_itimerspec knew_value, kold_value;
int ret;
@@ -232,8 +226,6 @@ int sys_timer_settime(timer_t timerid, int flags,
__nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value);
}
return ret;
-#else
- return __nolibc_enosys(__func__, timerid, flags, new_value, old_value);
#endif
}
diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h
index 16c6e9ec9451..470a5f77bc0f 100644
--- a/tools/include/nolibc/types.h
+++ b/tools/include/nolibc/types.h
@@ -70,11 +70,6 @@
#define DT_LNK 0xa
#define DT_SOCK 0xc
-/* commonly an fd_set represents 256 FDs */
-#ifndef FD_SETSIZE
-#define FD_SETSIZE 256
-#endif
-
/* PATH_MAX and MAXPATHLEN are often used and found with plenty of different
* values.
*/
@@ -115,48 +110,6 @@
#define EXIT_SUCCESS 0
#define EXIT_FAILURE 1
-#define FD_SETIDXMASK (8 * sizeof(unsigned long))
-#define FD_SETBITMASK (8 * sizeof(unsigned long)-1)
-
-/* for select() */
-typedef struct {
- unsigned long fds[(FD_SETSIZE + FD_SETBITMASK) / FD_SETIDXMASK];
-} fd_set;
-
-#define FD_CLR(fd, set) do { \
- fd_set *__set = (set); \
- int __fd = (fd); \
- if (__fd >= 0) \
- __set->fds[__fd / FD_SETIDXMASK] &= \
- ~(1U << (__fd & FD_SETBITMASK)); \
- } while (0)
-
-#define FD_SET(fd, set) do { \
- fd_set *__set = (set); \
- int __fd = (fd); \
- if (__fd >= 0) \
- __set->fds[__fd / FD_SETIDXMASK] |= \
- 1 << (__fd & FD_SETBITMASK); \
- } while (0)
-
-#define FD_ISSET(fd, set) ({ \
- fd_set *__set = (set); \
- int __fd = (fd); \
- int __r = 0; \
- if (__fd >= 0) \
- __r = !!(__set->fds[__fd / FD_SETIDXMASK] & \
-1U << (__fd & FD_SETBITMASK)); \
- __r; \
- })
-
-#define FD_ZERO(set) do { \
- fd_set *__set = (set); \
- int __idx; \
- int __size = (FD_SETSIZE+FD_SETBITMASK) / FD_SETIDXMASK;\
- for (__idx = 0; __idx < __size; __idx++) \
- __set->fds[__idx] = 0; \
- } while (0)
-
/* for getdents64() */
struct linux_dirent64 {
uint64_t d_ino;
diff --git a/tools/include/nolibc/unistd.h b/tools/include/nolibc/unistd.h
index 7405fa2b89ba..bb5e80f3f05d 100644
--- a/tools/include/nolibc/unistd.h
+++ b/tools/include/nolibc/unistd.h
@@ -54,7 +54,7 @@ int msleep(unsigned int msecs)
{
struct timeval my_timeval = { msecs / 1000, (msecs % 1000) * 1000 };
- if (sys_select(0, 0, 0, 0, &my_timeval) < 0)
+ if (sys_select(0, NULL, NULL, NULL, &my_timeval) < 0)
return (my_timeval.tv_sec * 1000) +
(my_timeval.tv_usec / 1000) +
!!(my_timeval.tv_usec % 1000);
@@ -67,7 +67,7 @@ unsigned int sleep(unsigned int seconds)
{
struct timeval my_timeval = { seconds, 0 };
- if (sys_select(0, 0, 0, 0, &my_timeval) < 0)
+ if (sys_select(0, NULL, NULL, NULL, &my_timeval) < 0)
return my_timeval.tv_sec + !!my_timeval.tv_usec;
else
return 0;
@@ -78,7 +78,7 @@ int usleep(unsigned int usecs)
{
struct timeval my_timeval = { usecs / 1000000, usecs % 1000000 };
- return sys_select(0, 0, 0, 0, &my_timeval);
+ return sys_select(0, NULL, NULL, NULL, &my_timeval);
}
static __attribute__((unused))
diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h
index e63a71d3c607..3cd5cf15e3c9 100644
--- a/tools/include/uapi/drm/drm.h
+++ b/tools/include/uapi/drm/drm.h
@@ -597,35 +597,66 @@ struct drm_set_version {
int drm_dd_minor;
};
-/* DRM_IOCTL_GEM_CLOSE ioctl argument type */
+/**
+ * struct drm_gem_close - Argument for &DRM_IOCTL_GEM_CLOSE ioctl.
+ * @handle: Handle of the object to be closed.
+ * @pad: Padding.
+ *
+ * Releases the handle to an mm object.
+ */
struct drm_gem_close {
- /** Handle of the object to be closed. */
__u32 handle;
__u32 pad;
};
-/* DRM_IOCTL_GEM_FLINK ioctl argument type */
+/**
+ * struct drm_gem_flink - Argument for &DRM_IOCTL_GEM_FLINK ioctl.
+ * @handle: Handle for the object being named.
+ * @name: Returned global name.
+ *
+ * Create a global name for an object, returning the name.
+ *
+ * Note that the name does not hold a reference; when the object
+ * is freed, the name goes away.
+ */
struct drm_gem_flink {
- /** Handle for the object being named */
__u32 handle;
-
- /** Returned global name */
__u32 name;
};
-/* DRM_IOCTL_GEM_OPEN ioctl argument type */
+/**
+ * struct drm_gem_open - Argument for &DRM_IOCTL_GEM_OPEN ioctl.
+ * @name: Name of object being opened.
+ * @handle: Returned handle for the object.
+ * @size: Returned size of the object
+ *
+ * Open an object using the global name, returning a handle and the size.
+ *
+ * This handle (of course) holds a reference to the object, so the object
+ * will not go away until the handle is deleted.
+ */
struct drm_gem_open {
- /** Name of object being opened */
__u32 name;
-
- /** Returned handle for the object */
__u32 handle;
-
- /** Returned size of the object */
__u64 size;
};
/**
+ * struct drm_gem_change_handle - Argument for &DRM_IOCTL_GEM_CHANGE_HANDLE ioctl.
+ * @handle: The handle of a gem object.
+ * @new_handle: An available gem handle.
+ *
+ * This ioctl changes the handle of a GEM object to the specified one.
+ * The new handle must be unused. On success the old handle is closed
+ * and all further IOCTL should refer to the new handle only.
+ * Calls to DRM_IOCTL_PRIME_FD_TO_HANDLE will return the new handle.
+ */
+struct drm_gem_change_handle {
+ __u32 handle;
+ __u32 new_handle;
+};
+
+/**
* DRM_CAP_DUMB_BUFFER
*
* If set to 1, the driver supports creating dumb buffers via the
@@ -1309,6 +1340,14 @@ extern "C" {
*/
#define DRM_IOCTL_SET_CLIENT_NAME DRM_IOWR(0xD1, struct drm_set_client_name)
+/**
+ * DRM_IOCTL_GEM_CHANGE_HANDLE - Move an object to a different handle
+ *
+ * Some applications (notably CRIU) need objects to have specific gem handles.
+ * This ioctl changes the object at one gem handle to use a new gem handle.
+ */
+#define DRM_IOCTL_GEM_CHANGE_HANDLE DRM_IOWR(0xD2, struct drm_gem_change_handle)
+
/*
* Device specific ioctls should only be in their respective headers
* The device specific ioctl range is from 0x40 to 0x9f.
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index f0f0d49d2544..52f6000ab020 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -962,6 +962,7 @@ struct kvm_enable_cap {
#define KVM_CAP_ARM_EL2_E2H0 241
#define KVM_CAP_RISCV_MP_STATE_RESET 242
#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
+#define KVM_CAP_GUEST_MEMFD_FLAGS 244
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1598,6 +1599,8 @@ struct kvm_memory_attributes {
#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
+#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0)
+#define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1)
struct kvm_create_guest_memfd {
__u64 size;
diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h
index 33c9b578b3b2..a25e38d1c874 100644
--- a/tools/include/uapi/linux/nsfs.h
+++ b/tools/include/uapi/linux/nsfs.h
@@ -53,6 +53,76 @@ enum init_ns_ino {
TIME_NS_INIT_INO = 0xEFFFFFFAU,
NET_NS_INIT_INO = 0xEFFFFFF9U,
MNT_NS_INIT_INO = 0xEFFFFFF8U,
+#ifdef __KERNEL__
+ MNT_NS_ANON_INO = 0xEFFFFFF7U,
+#endif
};
+struct nsfs_file_handle {
+ __u64 ns_id;
+ __u32 ns_type;
+ __u32 ns_inum;
+};
+
+#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
+#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
+
+enum init_ns_id {
+ IPC_NS_INIT_ID = 1ULL,
+ UTS_NS_INIT_ID = 2ULL,
+ USER_NS_INIT_ID = 3ULL,
+ PID_NS_INIT_ID = 4ULL,
+ CGROUP_NS_INIT_ID = 5ULL,
+ TIME_NS_INIT_ID = 6ULL,
+ NET_NS_INIT_ID = 7ULL,
+ MNT_NS_INIT_ID = 8ULL,
+#ifdef __KERNEL__
+ NS_LAST_INIT_ID = MNT_NS_INIT_ID,
+#endif
+};
+
+enum ns_type {
+ TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */
+ MNT_NS = (1ULL << 17), /* CLONE_NEWNS */
+ CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */
+ UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */
+ IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */
+ USER_NS = (1ULL << 28), /* CLONE_NEWUSER */
+ PID_NS = (1ULL << 29), /* CLONE_NEWPID */
+ NET_NS = (1ULL << 30), /* CLONE_NEWNET */
+};
+
+/**
+ * struct ns_id_req - namespace ID request structure
+ * @size: size of this structure
+ * @spare: reserved for future use
+ * @filter: filter mask
+ * @ns_id: last namespace id
+ * @user_ns_id: owning user namespace ID
+ *
+ * Structure for passing namespace ID and miscellaneous parameters to
+ * statns(2) and listns(2).
+ *
+ * For statns(2) @param represents the request mask.
+ * For listns(2) @param represents the last listed mount id (or zero).
+ */
+struct ns_id_req {
+ __u32 size;
+ __u32 spare;
+ __u64 ns_id;
+ struct /* listns */ {
+ __u32 ns_type;
+ __u32 spare2;
+ __u64 user_ns_id;
+ };
+};
+
+/*
+ * Special @user_ns_id value that can be passed to listns()
+ */
+#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */
+
+/* List of all ns_id_req versions. */
+#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */
+
#endif /* __LINUX_NSFS_H */
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 78a362b80027..d292f96bc06f 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -463,7 +463,9 @@ struct perf_event_attr {
inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
- __reserved_1 : 26;
+ defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+ defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+ __reserved_1 : 24;
union {
__u32 wakeup_events; /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
*/
PERF_RECORD_AUX_OUTPUT_HW_ID = 21,
+ /*
+ * This user callchain capture was deferred until shortly before
+ * returning to user space. Previous samples would have kernel
+ * callchains only and they need to be stitched with this to make full
+ * callchains.
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u64 cookie;
+ * u64 nr;
+ * u64 ips[nr];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_CALLCHAIN_DEFERRED = 22,
+
PERF_RECORD_MAX, /* non-ABI */
};
@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
PERF_CONTEXT_USER = (__u64)-512,
+ PERF_CONTEXT_USER_DEFERRED = (__u64)-640,
PERF_CONTEXT_GUEST = (__u64)-2048,
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index 80c028540656..d4e4e388e625 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -315,20 +315,20 @@ enum libbpf_tristate {
___param, sizeof(___param)); \
})
-extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
- __u32 len__sz, void *aux__prog) __weak __ksym;
-
-#define bpf_stream_printk(stream_id, fmt, args...) \
-({ \
- static const char ___fmt[] = fmt; \
- unsigned long long ___param[___bpf_narg(args)]; \
- \
- _Pragma("GCC diagnostic push") \
- _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
- ___bpf_fill(___param, args); \
- _Pragma("GCC diagnostic pop") \
- \
- bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param), NULL);\
+extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
+ __u32 len__sz, void *aux__prog) __weak __ksym;
+
+#define bpf_stream_printk(stream_id, fmt, args...) \
+({ \
+ static const char ___fmt[] = fmt; \
+ unsigned long long ___param[___bpf_narg(args)]; \
+ \
+ _Pragma("GCC diagnostic push") \
+ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
+ ___bpf_fill(___param, args); \
+ _Pragma("GCC diagnostic pop") \
+ \
+ bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \
})
/* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args
diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index a8f6cd4841b0..dbe32a5d02cd 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -311,7 +311,7 @@ struct pt_regs___arm64 {
#define __PT_RET_REG regs[31]
#define __PT_FP_REG __unsupported__
#define __PT_RC_REG gpr[3]
-#define __PT_SP_REG sp
+#define __PT_SP_REG gpr[1]
#define __PT_IP_REG nip
#elif defined(bpf_target_sparc)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index dd3b2f57082d..85abc357da31 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -11325,8 +11325,6 @@ static const char *arch_specific_syscall_pfx(void)
return "ia32";
#elif defined(__s390x__)
return "s390x";
-#elif defined(__s390__)
- return "s390";
#elif defined(__arm__)
return "arm";
#elif defined(__aarch64__)
@@ -12113,8 +12111,6 @@ static const char *arch_specific_lib_paths(void)
return "/lib/i386-linux-gnu";
#elif defined(__s390x__)
return "/lib/s390x-linux-gnu";
-#elif defined(__s390__)
- return "/lib/s390-linux-gnu";
#elif defined(__arm__) && defined(__SOFTFP__)
return "/lib/arm-linux-gnueabi";
#elif defined(__arm__) && !defined(__SOFTFP__)
diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
index c174b4086673..d1524f6f54ae 100644
--- a/tools/lib/bpf/usdt.c
+++ b/tools/lib/bpf/usdt.c
@@ -1376,8 +1376,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec
#elif defined(__s390x__)
-/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. */
-
static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz)
{
unsigned int reg;
diff --git a/tools/docs/lib/__init__.py b/tools/lib/python/__init__.py
index e69de29bb2d1..e69de29bb2d1 100644
--- a/tools/docs/lib/__init__.py
+++ b/tools/lib/python/__init__.py
diff --git a/tools/lib/python/abi/__init__.py b/tools/lib/python/abi/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/tools/lib/python/abi/__init__.py
diff --git a/tools/lib/python/abi/abi_parser.py b/tools/lib/python/abi/abi_parser.py
new file mode 100644
index 000000000000..9b8db70067ef
--- /dev/null
+++ b/tools/lib/python/abi/abi_parser.py
@@ -0,0 +1,628 @@
+#!/usr/bin/env python3
+# pylint: disable=R0902,R0903,R0911,R0912,R0913,R0914,R0915,R0917,C0302
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+Parse ABI documentation and produce results from it.
+"""
+
+from argparse import Namespace
+import logging
+import os
+import re
+
+from pprint import pformat
+from random import randrange, seed
+
+# Import Python modules
+
+from abi.helpers import AbiDebug, ABI_DIR
+
+
+class AbiParser:
+ """Main class to parse ABI files"""
+
+ TAGS = r"(what|where|date|kernelversion|contact|description|users)"
+ XREF = r"(?:^|\s|\()(\/(?:sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)(?:[,.:;\)\s]|\Z)"
+
+ def __init__(self, directory, logger=None,
+ enable_lineno=False, show_warnings=True, debug=0):
+ """Stores arguments for the class and initialize class vars"""
+
+ self.directory = directory
+ self.enable_lineno = enable_lineno
+ self.show_warnings = show_warnings
+ self.debug = debug
+
+ if not logger:
+ self.log = logging.getLogger("get_abi")
+ else:
+ self.log = logger
+
+ self.data = {}
+ self.what_symbols = {}
+ self.file_refs = {}
+ self.what_refs = {}
+
+ # Ignore files that contain such suffixes
+ self.ignore_suffixes = (".rej", ".org", ".orig", ".bak", "~")
+
+ # Regular expressions used on parser
+ self.re_abi_dir = re.compile(r"(.*)" + ABI_DIR)
+ self.re_tag = re.compile(r"(\S+)(:\s*)(.*)", re.I)
+ self.re_valid = re.compile(self.TAGS)
+ self.re_start_spc = re.compile(r"(\s*)(\S.*)")
+ self.re_whitespace = re.compile(r"^\s+")
+
+ # Regular used on print
+ self.re_what = re.compile(r"(\/?(?:[\w\-]+\/?){1,2})")
+ self.re_escape = re.compile(r"([\.\x01-\x08\x0e-\x1f\x21-\x2f\x3a-\x40\x7b-\xff])")
+ self.re_unprintable = re.compile(r"([\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\xff]+)")
+ self.re_title_mark = re.compile(r"\n[\-\*\=\^\~]+\n")
+ self.re_doc = re.compile(r"Documentation/(?!devicetree)(\S+)\.rst")
+ self.re_abi = re.compile(r"(Documentation/ABI/)([\w\/\-]+)")
+ self.re_xref_node = re.compile(self.XREF)
+
+ def warn(self, fdata, msg, extra=None):
+ """Displays a parse error if warning is enabled"""
+
+ if not self.show_warnings:
+ return
+
+ msg = f"{fdata.fname}:{fdata.ln}: {msg}"
+ if extra:
+ msg += "\n\t\t" + extra
+
+ self.log.warning(msg)
+
+ def add_symbol(self, what, fname, ln=None, xref=None):
+ """Create a reference table describing where each 'what' is located"""
+
+ if what not in self.what_symbols:
+ self.what_symbols[what] = {"file": {}}
+
+ if fname not in self.what_symbols[what]["file"]:
+ self.what_symbols[what]["file"][fname] = []
+
+ if ln and ln not in self.what_symbols[what]["file"][fname]:
+ self.what_symbols[what]["file"][fname].append(ln)
+
+ if xref:
+ self.what_symbols[what]["xref"] = xref
+
+ def _parse_line(self, fdata, line):
+ """Parse a single line of an ABI file"""
+
+ new_what = False
+ new_tag = False
+ content = None
+
+ match = self.re_tag.match(line)
+ if match:
+ new = match.group(1).lower()
+ sep = match.group(2)
+ content = match.group(3)
+
+ match = self.re_valid.search(new)
+ if match:
+ new_tag = match.group(1)
+ else:
+ if fdata.tag == "description":
+ # New "tag" is actually part of description.
+ # Don't consider it a tag
+ new_tag = False
+ elif fdata.tag != "":
+ self.warn(fdata, f"tag '{fdata.tag}' is invalid", line)
+
+ if new_tag:
+ # "where" is Invalid, but was a common mistake. Warn if found
+ if new_tag == "where":
+ self.warn(fdata, "tag 'Where' is invalid. Should be 'What:' instead")
+ new_tag = "what"
+
+ if new_tag == "what":
+ fdata.space = None
+
+ if content not in self.what_symbols:
+ self.add_symbol(what=content, fname=fdata.fname, ln=fdata.ln)
+
+ if fdata.tag == "what":
+ fdata.what.append(content.strip("\n"))
+ else:
+ if fdata.key:
+ if "description" not in self.data.get(fdata.key, {}):
+ self.warn(fdata, f"{fdata.key} doesn't have a description")
+
+ for w in fdata.what:
+ self.add_symbol(what=w, fname=fdata.fname,
+ ln=fdata.what_ln, xref=fdata.key)
+
+ fdata.label = content
+ new_what = True
+
+ key = "abi_" + content.lower()
+ fdata.key = self.re_unprintable.sub("_", key).strip("_")
+
+ # Avoid duplicated keys but using a defined seed, to make
+ # the namespace identical if there aren't changes at the
+ # ABI symbols
+ seed(42)
+
+ while fdata.key in self.data:
+ char = randrange(0, 51) + ord("A")
+ if char > ord("Z"):
+ char += ord("a") - ord("Z") - 1
+
+ fdata.key += chr(char)
+
+ if fdata.key and fdata.key not in self.data:
+ self.data[fdata.key] = {
+ "what": [content],
+ "file": [fdata.file_ref],
+ "path": fdata.ftype,
+ "line_no": fdata.ln,
+ }
+
+ fdata.what = self.data[fdata.key]["what"]
+
+ self.what_refs[content] = fdata.key
+ fdata.tag = new_tag
+ fdata.what_ln = fdata.ln
+
+ if fdata.nametag["what"]:
+ t = (content, fdata.key)
+ if t not in fdata.nametag["symbols"]:
+ fdata.nametag["symbols"].append(t)
+
+ return
+
+ if fdata.tag and new_tag:
+ fdata.tag = new_tag
+
+ if new_what:
+ fdata.label = ""
+
+ if "description" in self.data[fdata.key]:
+ self.data[fdata.key]["description"] += "\n\n"
+
+ if fdata.file_ref not in self.data[fdata.key]["file"]:
+ self.data[fdata.key]["file"].append(fdata.file_ref)
+
+ if self.debug == AbiDebug.WHAT_PARSING:
+ self.log.debug("what: %s", fdata.what)
+
+ if not fdata.what:
+ self.warn(fdata, "'What:' should come first:", line)
+ return
+
+ if new_tag == "description":
+ fdata.space = None
+
+ if content:
+ sep = sep.replace(":", " ")
+
+ c = " " * len(new_tag) + sep + content
+ c = c.expandtabs()
+
+ match = self.re_start_spc.match(c)
+ if match:
+ # Preserve initial spaces for the first line
+ fdata.space = match.group(1)
+ content = match.group(2) + "\n"
+
+ self.data[fdata.key][fdata.tag] = content
+
+ return
+
+ # Store any contents before tags at the database
+ if not fdata.tag and "what" in fdata.nametag:
+ fdata.nametag["description"] += line
+ return
+
+ if fdata.tag == "description":
+ content = line.expandtabs()
+
+ if self.re_whitespace.sub("", content) == "":
+ self.data[fdata.key][fdata.tag] += "\n"
+ return
+
+ if fdata.space is None:
+ match = self.re_start_spc.match(content)
+ if match:
+ # Preserve initial spaces for the first line
+ fdata.space = match.group(1)
+
+ content = match.group(2) + "\n"
+ else:
+ if content.startswith(fdata.space):
+ content = content[len(fdata.space):]
+
+ else:
+ fdata.space = ""
+
+ if fdata.tag == "what":
+ w = content.strip("\n")
+ if w:
+ self.data[fdata.key][fdata.tag].append(w)
+ else:
+ self.data[fdata.key][fdata.tag] += content
+ return
+
+ content = line.strip()
+ if fdata.tag:
+ if fdata.tag == "what":
+ w = content.strip("\n")
+ if w:
+ self.data[fdata.key][fdata.tag].append(w)
+ else:
+ self.data[fdata.key][fdata.tag] += "\n" + content.rstrip("\n")
+ return
+
+ # Everything else is error
+ if content:
+ self.warn(fdata, "Unexpected content", line)
+
+ def parse_readme(self, nametag, fname):
+ """Parse ABI README file"""
+
+ nametag["what"] = ["Introduction"]
+ nametag["path"] = "README"
+ with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp:
+ for line in fp:
+ match = self.re_tag.match(line)
+ if match:
+ new = match.group(1).lower()
+
+ match = self.re_valid.search(new)
+ if match:
+ nametag["description"] += "\n:" + line
+ continue
+
+ nametag["description"] += line
+
+ def parse_file(self, fname, path, basename):
+ """Parse a single file"""
+
+ ref = f"abi_file_{path}_{basename}"
+ ref = self.re_unprintable.sub("_", ref).strip("_")
+
+ # Store per-file state into a namespace variable. This will be used
+ # by the per-line parser state machine and by the warning function.
+ fdata = Namespace
+
+ fdata.fname = fname
+ fdata.name = basename
+
+ pos = fname.find(ABI_DIR)
+ if pos > 0:
+ f = fname[pos:]
+ else:
+ f = fname
+
+ fdata.file_ref = (f, ref)
+ self.file_refs[f] = ref
+
+ fdata.ln = 0
+ fdata.what_ln = 0
+ fdata.tag = ""
+ fdata.label = ""
+ fdata.what = []
+ fdata.key = None
+ fdata.xrefs = None
+ fdata.space = None
+ fdata.ftype = path.split("/")[0]
+
+ fdata.nametag = {}
+ fdata.nametag["what"] = [f"ABI file {path}/{basename}"]
+ fdata.nametag["type"] = "File"
+ fdata.nametag["path"] = fdata.ftype
+ fdata.nametag["file"] = [fdata.file_ref]
+ fdata.nametag["line_no"] = 1
+ fdata.nametag["description"] = ""
+ fdata.nametag["symbols"] = []
+
+ self.data[ref] = fdata.nametag
+
+ if self.debug & AbiDebug.WHAT_OPEN:
+ self.log.debug("Opening file %s", fname)
+
+ if basename == "README":
+ self.parse_readme(fdata.nametag, fname)
+ return
+
+ with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp:
+ for line in fp:
+ fdata.ln += 1
+
+ self._parse_line(fdata, line)
+
+ if "description" in fdata.nametag:
+ fdata.nametag["description"] = fdata.nametag["description"].lstrip("\n")
+
+ if fdata.key:
+ if "description" not in self.data.get(fdata.key, {}):
+ self.warn(fdata, f"{fdata.key} doesn't have a description")
+
+ for w in fdata.what:
+ self.add_symbol(what=w, fname=fname, xref=fdata.key)
+
+ def _parse_abi(self, root=None):
+ """Internal function to parse documentation ABI recursively"""
+
+ if not root:
+ root = self.directory
+
+ with os.scandir(root) as obj:
+ for entry in obj:
+ name = os.path.join(root, entry.name)
+
+ if entry.is_dir():
+ self._parse_abi(name)
+ continue
+
+ if not entry.is_file():
+ continue
+
+ basename = os.path.basename(name)
+
+ if basename.startswith("."):
+ continue
+
+ if basename.endswith(self.ignore_suffixes):
+ continue
+
+ path = self.re_abi_dir.sub("", os.path.dirname(name))
+
+ self.parse_file(name, path, basename)
+
+ def parse_abi(self, root=None):
+ """Parse documentation ABI"""
+
+ self._parse_abi(root)
+
+ if self.debug & AbiDebug.DUMP_ABI_STRUCTS:
+ self.log.debug(pformat(self.data))
+
+ def desc_txt(self, desc):
+ """Print description as found inside ABI files"""
+
+ desc = desc.strip(" \t\n")
+
+ return desc + "\n\n"
+
+ def xref(self, fname):
+ """
+ Converts a Documentation/ABI + basename into a ReST cross-reference
+ """
+
+ xref = self.file_refs.get(fname)
+ if not xref:
+ return None
+ else:
+ return xref
+
+ def desc_rst(self, desc):
+ """Enrich ReST output by creating cross-references"""
+
+ # Remove title markups from the description
+ # Having titles inside ABI files will only work if extra
+ # care would be taken in order to strictly follow the same
+ # level order for each markup.
+ desc = self.re_title_mark.sub("\n\n", "\n" + desc)
+ desc = desc.rstrip(" \t\n").lstrip("\n")
+
+ # Python's regex performance for non-compiled expressions is a lot
+ # than Perl, as Perl automatically caches them at their
+ # first usage. Here, we'll need to do the same, as otherwise the
+ # performance penalty is be high
+
+ new_desc = ""
+ for d in desc.split("\n"):
+ if d == "":
+ new_desc += "\n"
+ continue
+
+ # Use cross-references for doc files where needed
+ d = self.re_doc.sub(r":doc:`/\1`", d)
+
+ # Use cross-references for ABI generated docs where needed
+ matches = self.re_abi.findall(d)
+ for m in matches:
+ abi = m[0] + m[1]
+
+ xref = self.file_refs.get(abi)
+ if not xref:
+ # This may happen if ABI is on a separate directory,
+ # like parsing ABI testing and symbol is at stable.
+ # The proper solution is to move this part of the code
+ # for it to be inside sphinx/kernel_abi.py
+ self.log.info("Didn't find ABI reference for '%s'", abi)
+ else:
+ new = self.re_escape.sub(r"\\\1", m[1])
+ d = re.sub(fr"\b{abi}\b", f":ref:`{new} <{xref}>`", d)
+
+ # Seek for cross reference symbols like /sys/...
+ # Need to be careful to avoid doing it on a code block
+ if d[0] not in [" ", "\t"]:
+ matches = self.re_xref_node.findall(d)
+ for m in matches:
+ # Finding ABI here is more complex due to wildcards
+ xref = self.what_refs.get(m)
+ if xref:
+ new = self.re_escape.sub(r"\\\1", m)
+ d = re.sub(fr"\b{m}\b", f":ref:`{new} <{xref}>`", d)
+
+ new_desc += d + "\n"
+
+ return new_desc + "\n\n"
+
+ def doc(self, output_in_txt=False, show_symbols=True, show_file=True,
+ filter_path=None):
+ """Print ABI at stdout"""
+
+ part = None
+ for key, v in sorted(self.data.items(),
+ key=lambda x: (x[1].get("type", ""),
+ x[1].get("what"))):
+
+ wtype = v.get("type", "Symbol")
+ file_ref = v.get("file")
+ names = v.get("what", [""])
+
+ if wtype == "File":
+ if not show_file:
+ continue
+ else:
+ if not show_symbols:
+ continue
+
+ if filter_path:
+ if v.get("path") != filter_path:
+ continue
+
+ msg = ""
+
+ if wtype != "File":
+ cur_part = names[0]
+ if cur_part.find("/") >= 0:
+ match = self.re_what.match(cur_part)
+ if match:
+ symbol = match.group(1).rstrip("/")
+ cur_part = "Symbols under " + symbol
+
+ if cur_part and cur_part != part:
+ part = cur_part
+ msg += part + "\n"+ "-" * len(part) +"\n\n"
+
+ msg += f".. _{key}:\n\n"
+
+ max_len = 0
+ for i in range(0, len(names)): # pylint: disable=C0200
+ names[i] = "**" + self.re_escape.sub(r"\\\1", names[i]) + "**"
+
+ max_len = max(max_len, len(names[i]))
+
+ msg += "+-" + "-" * max_len + "-+\n"
+ for name in names:
+ msg += f"| {name}" + " " * (max_len - len(name)) + " |\n"
+ msg += "+-" + "-" * max_len + "-+\n"
+ msg += "\n"
+
+ for ref in file_ref:
+ if wtype == "File":
+ msg += f".. _{ref[1]}:\n\n"
+ else:
+ base = os.path.basename(ref[0])
+ msg += f"Defined on file :ref:`{base} <{ref[1]}>`\n\n"
+
+ if wtype == "File":
+ msg += names[0] +"\n" + "-" * len(names[0]) +"\n\n"
+
+ desc = v.get("description")
+ if not desc and wtype != "File":
+ msg += f"DESCRIPTION MISSING for {names[0]}\n\n"
+
+ if desc:
+ if output_in_txt:
+ msg += self.desc_txt(desc)
+ else:
+ msg += self.desc_rst(desc)
+
+ symbols = v.get("symbols")
+ if symbols:
+ msg += "Has the following ABI:\n\n"
+
+ for w, label in symbols:
+ # Escape special chars from content
+ content = self.re_escape.sub(r"\\\1", w)
+
+ msg += f"- :ref:`{content} <{label}>`\n\n"
+
+ users = v.get("users")
+ if users and users.strip(" \t\n"):
+ users = users.strip("\n").replace('\n', '\n\t')
+ msg += f"Users:\n\t{users}\n\n"
+
+ ln = v.get("line_no", 1)
+
+ yield (msg, file_ref[0][0], ln)
+
+ def check_issues(self):
+ """Warn about duplicated ABI entries"""
+
+ for what, v in self.what_symbols.items():
+ files = v.get("file")
+ if not files:
+ # Should never happen if the parser works properly
+ self.log.warning("%s doesn't have a file associated", what)
+ continue
+
+ if len(files) == 1:
+ continue
+
+ f = []
+ for fname, lines in sorted(files.items()):
+ if not lines:
+ f.append(f"{fname}")
+ elif len(lines) == 1:
+ f.append(f"{fname}:{lines[0]}")
+ else:
+ m = fname + "lines "
+ m += ", ".join(str(x) for x in lines)
+ f.append(m)
+
+ self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f))
+
+ def search_symbols(self, expr):
+ """ Searches for ABI symbols """
+
+ regex = re.compile(expr, re.I)
+
+ found_keys = 0
+ for t in sorted(self.data.items(), key=lambda x: [0]):
+ v = t[1]
+
+ wtype = v.get("type", "")
+ if wtype == "File":
+ continue
+
+ for what in v.get("what", [""]):
+ if regex.search(what):
+ found_keys += 1
+
+ kernelversion = v.get("kernelversion", "").strip(" \t\n")
+ date = v.get("date", "").strip(" \t\n")
+ contact = v.get("contact", "").strip(" \t\n")
+ users = v.get("users", "").strip(" \t\n")
+ desc = v.get("description", "").strip(" \t\n")
+
+ files = []
+ for f in v.get("file", ()):
+ files.append(f[0])
+
+ what = str(found_keys) + ". " + what
+ title_tag = "-" * len(what)
+
+ print(f"\n{what}\n{title_tag}\n")
+
+ if kernelversion:
+ print(f"Kernel version:\t\t{kernelversion}")
+
+ if date:
+ print(f"Date:\t\t\t{date}")
+
+ if contact:
+ print(f"Contact:\t\t{contact}")
+
+ if users:
+ print(f"Users:\t\t\t{users}")
+
+ print("Defined on file(s):\t" + ", ".join(files))
+
+ if desc:
+ desc = desc.strip("\n")
+ print(f"\n{desc}\n")
+
+ if not found_keys:
+ print(f"Regular expression /{expr}/ not found.")
diff --git a/tools/lib/python/abi/abi_regex.py b/tools/lib/python/abi/abi_regex.py
new file mode 100644
index 000000000000..d5553206de3c
--- /dev/null
+++ b/tools/lib/python/abi/abi_regex.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+# xxpylint: disable=R0903
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+Convert ABI what into regular expressions
+"""
+
+import re
+import sys
+
+from pprint import pformat
+
+from abi.abi_parser import AbiParser
+from abi.helpers import AbiDebug
+
+class AbiRegex(AbiParser):
+ """Extends AbiParser to search ABI nodes with regular expressions"""
+
+ # Escape only ASCII visible characters
+ escape_symbols = r"([\x21-\x29\x2b-\x2d\x3a-\x40\x5c\x60\x7b-\x7e])"
+ leave_others = "others"
+
+ # Tuples with regular expressions to be compiled and replacement data
+ re_whats = [
+ # Drop escape characters that might exist
+ (re.compile("\\\\"), ""),
+
+ # Temporarily escape dot characters
+ (re.compile(r"\."), "\xf6"),
+
+ # Temporarily change [0-9]+ type of patterns
+ (re.compile(r"\[0\-9\]\+"), "\xff"),
+
+ # Temporarily change [\d+-\d+] type of patterns
+ (re.compile(r"\[0\-\d+\]"), "\xff"),
+ (re.compile(r"\[0:\d+\]"), "\xff"),
+ (re.compile(r"\[(\d+)\]"), "\xf4\\\\d+\xf5"),
+
+ # Temporarily change [0-9] type of patterns
+ (re.compile(r"\[(\d)\-(\d)\]"), "\xf4\1-\2\xf5"),
+
+ # Handle multiple option patterns
+ (re.compile(r"[\{\<\[]([\w_]+)(?:[,|]+([\w_]+)){1,}[\}\>\]]"), r"(\1|\2)"),
+
+ # Handle wildcards
+ (re.compile(r"([^\/])\*"), "\\1\\\\w\xf7"),
+ (re.compile(r"/\*/"), "/.*/"),
+ (re.compile(r"/\xf6\xf6\xf6"), "/.*"),
+ (re.compile(r"\<[^\>]+\>"), "\\\\w\xf7"),
+ (re.compile(r"\{[^\}]+\}"), "\\\\w\xf7"),
+ (re.compile(r"\[[^\]]+\]"), "\\\\w\xf7"),
+
+ (re.compile(r"XX+"), "\\\\w\xf7"),
+ (re.compile(r"([^A-Z])[XYZ]([^A-Z])"), "\\1\\\\w\xf7\\2"),
+ (re.compile(r"([^A-Z])[XYZ]$"), "\\1\\\\w\xf7"),
+ (re.compile(r"_[AB]_"), "_\\\\w\xf7_"),
+
+ # Recover [0-9] type of patterns
+ (re.compile(r"\xf4"), "["),
+ (re.compile(r"\xf5"), "]"),
+
+ # Remove duplicated spaces
+ (re.compile(r"\s+"), r" "),
+
+ # Special case: drop comparison as in:
+ # What: foo = <something>
+ # (this happens on a few IIO definitions)
+ (re.compile(r"\s*\=.*$"), ""),
+
+ # Escape all other symbols
+ (re.compile(escape_symbols), r"\\\1"),
+ (re.compile(r"\\\\"), r"\\"),
+ (re.compile(r"\\([\[\]\(\)\|])"), r"\1"),
+ (re.compile(r"(\d+)\\(-\d+)"), r"\1\2"),
+
+ (re.compile(r"\xff"), r"\\d+"),
+
+ # Special case: IIO ABI which a parenthesis.
+ (re.compile(r"sqrt(.*)"), r"sqrt(.*)"),
+
+ # Simplify regexes with multiple .*
+ (re.compile(r"(?:\.\*){2,}"), ""),
+
+ # Recover dot characters
+ (re.compile(r"\xf6"), "\\."),
+ # Recover plus characters
+ (re.compile(r"\xf7"), "+"),
+ ]
+ re_has_num = re.compile(r"\\d")
+
+ # Symbol name after escape_chars that are considered a devnode basename
+ re_symbol_name = re.compile(r"(\w|\\[\.\-\:])+$")
+
+ # List of popular group names to be skipped to minimize regex group size
+ # Use AbiDebug.SUBGROUP_SIZE to detect those
+ skip_names = set(["devices", "hwmon"])
+
+ def regex_append(self, what, new):
+ """
+ Get a search group for a subset of regular expressions.
+
+ As ABI may have thousands of symbols, using a for to search all
+ regular expressions is at least O(n^2). When there are wildcards,
+ the complexity increases substantially, eventually becoming exponential.
+
+ To avoid spending too much time on them, use a logic to split
+ them into groups. The smaller the group, the better, as it would
+ mean that searches will be confined to a small number of regular
+ expressions.
+
+ The conversion to a regex subset is tricky, as we need something
+ that can be easily obtained from the sysfs symbol and from the
+ regular expression. So, we need to discard nodes that have
+ wildcards.
+
+ If it can't obtain a subgroup, place the regular expression inside
+ a special group (self.leave_others).
+ """
+
+ search_group = None
+
+ for search_group in reversed(new.split("/")):
+ if not search_group or search_group in self.skip_names:
+ continue
+ if self.re_symbol_name.match(search_group):
+ break
+
+ if not search_group:
+ search_group = self.leave_others
+
+ if self.debug & AbiDebug.SUBGROUP_MAP:
+ self.log.debug("%s: mapped as %s", what, search_group)
+
+ try:
+ if search_group not in self.regex_group:
+ self.regex_group[search_group] = []
+
+ self.regex_group[search_group].append(re.compile(new))
+ if self.search_string:
+ if what.find(self.search_string) >= 0:
+ print(f"What: {what}")
+ except re.PatternError:
+ self.log.warning("Ignoring '%s' as it produced an invalid regex:\n"
+ " '%s'", what, new)
+
+ def get_regexes(self, what):
+ """
+ Given an ABI devnode, return a list of all regular expressions that
+ may match it, based on the sub-groups created by regex_append()
+ """
+
+ re_list = []
+
+ patches = what.split("/")
+ patches.reverse()
+ patches.append(self.leave_others)
+
+ for search_group in patches:
+ if search_group in self.regex_group:
+ re_list += self.regex_group[search_group]
+
+ return re_list
+
+ def __init__(self, *args, **kwargs):
+ """
+ Override init method to get verbose argument
+ """
+
+ self.regex_group = None
+ self.search_string = None
+ self.re_string = None
+
+ if "search_string" in kwargs:
+ self.search_string = kwargs.get("search_string")
+ del kwargs["search_string"]
+
+ if self.search_string:
+
+ try:
+ self.re_string = re.compile(self.search_string)
+ except re.PatternError as e:
+ msg = f"{self.search_string} is not a valid regular expression"
+ raise ValueError(msg) from e
+
+ super().__init__(*args, **kwargs)
+
+ def parse_abi(self, *args, **kwargs):
+
+ super().parse_abi(*args, **kwargs)
+
+ self.regex_group = {}
+
+ print("Converting ABI What fields into regexes...", file=sys.stderr)
+
+ for t in sorted(self.data.items(), key=lambda x: x[0]):
+ v = t[1]
+ if v.get("type") == "File":
+ continue
+
+ v["regex"] = []
+
+ for what in v.get("what", []):
+ if not what.startswith("/sys"):
+ continue
+
+ new = what
+ for r, s in self.re_whats:
+ try:
+ new = r.sub(s, new)
+ except re.PatternError as e:
+ # Help debugging troubles with new regexes
+ raise re.PatternError(f"{e}\nwhile re.sub('{r.pattern}', {s}, str)") from e
+
+ v["regex"].append(new)
+
+ if self.debug & AbiDebug.REGEX:
+ self.log.debug("%-90s <== %s", new, what)
+
+ # Store regex into a subgroup to speedup searches
+ self.regex_append(what, new)
+
+ if self.debug & AbiDebug.SUBGROUP_DICT:
+ self.log.debug("%s", pformat(self.regex_group))
+
+ if self.debug & AbiDebug.SUBGROUP_SIZE:
+ biggestd_keys = sorted(self.regex_group.keys(),
+ key= lambda k: len(self.regex_group[k]),
+ reverse=True)
+
+ print("Top regex subgroups:", file=sys.stderr)
+ for k in biggestd_keys[:10]:
+ print(f"{k} has {len(self.regex_group[k])} elements", file=sys.stderr)
diff --git a/tools/lib/python/abi/helpers.py b/tools/lib/python/abi/helpers.py
new file mode 100644
index 000000000000..639b23e4ca33
--- /dev/null
+++ b/tools/lib/python/abi/helpers.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+# pylint: disable=R0903
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+Helper classes for ABI parser
+"""
+
+ABI_DIR = "Documentation/ABI/"
+
+
+class AbiDebug:
+ """Debug levels"""
+
+ WHAT_PARSING = 1
+ WHAT_OPEN = 2
+ DUMP_ABI_STRUCTS = 4
+ UNDEFINED = 8
+ REGEX = 16
+ SUBGROUP_MAP = 32
+ SUBGROUP_DICT = 64
+ SUBGROUP_SIZE = 128
+ GRAPH = 256
+
+
+DEBUG_HELP = """
+1 - enable debug parsing logic
+2 - enable debug messages on file open
+4 - enable debug for ABI parse data
+8 - enable extra debug information to identify troubles
+ with ABI symbols found at the local machine that
+ weren't found on ABI documentation (used only for
+ undefined subcommand)
+16 - enable debug for what to regex conversion
+32 - enable debug for symbol regex subgroups
+64 - enable debug for sysfs graph tree variable
+"""
diff --git a/tools/lib/python/abi/system_symbols.py b/tools/lib/python/abi/system_symbols.py
new file mode 100644
index 000000000000..4a2554da217b
--- /dev/null
+++ b/tools/lib/python/abi/system_symbols.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python3
+# pylint: disable=R0902,R0912,R0914,R0915,R1702
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+Parse ABI documentation and produce results from it.
+"""
+
+import os
+import re
+import sys
+
+from concurrent import futures
+from datetime import datetime
+from random import shuffle
+
+from abi.helpers import AbiDebug
+
+class SystemSymbols:
+ """Stores arguments for the class and initialize class vars"""
+
+ def graph_add_file(self, path, link=None):
+ """
+ add a file path to the sysfs graph stored at self.root
+ """
+
+ if path in self.files:
+ return
+
+ name = ""
+ ref = self.root
+ for edge in path.split("/"):
+ name += edge + "/"
+ if edge not in ref:
+ ref[edge] = {"__name": [name.rstrip("/")]}
+
+ ref = ref[edge]
+
+ if link and link not in ref["__name"]:
+ ref["__name"].append(link.rstrip("/"))
+
+ self.files.add(path)
+
+ def print_graph(self, root_prefix="", root=None, level=0):
+ """Prints a reference tree graph using UTF-8 characters"""
+
+ if not root:
+ root = self.root
+ level = 0
+
+ # Prevent endless traverse
+ if level > 5:
+ return
+
+ if level > 0:
+ prefix = "├──"
+ last_prefix = "└──"
+ else:
+ prefix = ""
+ last_prefix = ""
+
+ items = list(root.items())
+
+ names = root.get("__name", [])
+ for k, edge in items:
+ if k == "__name":
+ continue
+
+ if not k:
+ k = "/"
+
+ if len(names) > 1:
+ k += " links: " + ",".join(names[1:])
+
+ if edge == items[-1][1]:
+ print(root_prefix + last_prefix + k)
+ p = root_prefix
+ if level > 0:
+ p += " "
+ self.print_graph(p, edge, level + 1)
+ else:
+ print(root_prefix + prefix + k)
+ p = root_prefix + "│ "
+ self.print_graph(p, edge, level + 1)
+
+ def _walk(self, root):
+ """
+ Walk through sysfs to get all devnodes that aren't ignored.
+
+ By default, uses /sys as sysfs mounting point. If another
+ directory is used, it replaces them to /sys at the patches.
+ """
+
+ with os.scandir(root) as obj:
+ for entry in obj:
+ path = os.path.join(root, entry.name)
+ if self.sysfs:
+ p = path.replace(self.sysfs, "/sys", count=1)
+ else:
+ p = path
+
+ if self.re_ignore.search(p):
+ return
+
+ # Handle link first to avoid directory recursion
+ if entry.is_symlink():
+ real = os.path.realpath(path)
+ if not self.sysfs:
+ self.aliases[path] = real
+ else:
+ real = real.replace(self.sysfs, "/sys", count=1)
+
+ # Add absfile location to graph if it doesn't exist
+ if not self.re_ignore.search(real):
+ # Add link to the graph
+ self.graph_add_file(real, p)
+
+ elif entry.is_file():
+ self.graph_add_file(p)
+
+ elif entry.is_dir():
+ self._walk(path)
+
+ def __init__(self, abi, sysfs="/sys", hints=False):
+ """
+ Initialize internal variables and get a list of all files inside
+ sysfs that can currently be parsed.
+
+ Please notice that there are several entries on sysfs that aren't
+ documented as ABI. Ignore those.
+
+ The real paths will be stored under self.files. Aliases will be
+ stored in separate, as self.aliases.
+ """
+
+ self.abi = abi
+ self.log = abi.log
+
+ if sysfs != "/sys":
+ self.sysfs = sysfs.rstrip("/")
+ else:
+ self.sysfs = None
+
+ self.hints = hints
+
+ self.root = {}
+ self.aliases = {}
+ self.files = set()
+
+ dont_walk = [
+ # Those require root access and aren't documented at ABI
+ f"^{sysfs}/kernel/debug",
+ f"^{sysfs}/kernel/tracing",
+ f"^{sysfs}/fs/pstore",
+ f"^{sysfs}/fs/bpf",
+ f"^{sysfs}/fs/fuse",
+
+ # This is not documented at ABI
+ f"^{sysfs}/module",
+
+ f"^{sysfs}/fs/cgroup", # this is big and has zero docs under ABI
+ f"^{sysfs}/firmware", # documented elsewhere: ACPI, DT bindings
+ "sections|notes", # aren't actually part of ABI
+
+ # kernel-parameters.txt - not easy to parse
+ "parameters",
+ ]
+
+ self.re_ignore = re.compile("|".join(dont_walk))
+
+ print(f"Reading {sysfs} directory contents...", file=sys.stderr)
+ self._walk(sysfs)
+
+ def check_file(self, refs, found):
+ """Check missing ABI symbols for a given sysfs file"""
+
+ res_list = []
+
+ try:
+ for names in refs:
+ fname = names[0]
+
+ res = {
+ "found": False,
+ "fname": fname,
+ "msg": "",
+ }
+ res_list.append(res)
+
+ re_what = self.abi.get_regexes(fname)
+ if not re_what:
+ self.abi.log.warning(f"missing rules for {fname}")
+ continue
+
+ for name in names:
+ for r in re_what:
+ if self.abi.debug & AbiDebug.UNDEFINED:
+ self.log.debug("check if %s matches '%s'", name, r.pattern)
+ if r.match(name):
+ res["found"] = True
+ if found:
+ res["msg"] += f" {fname}: regex:\n\t"
+ continue
+
+ if self.hints and not res["found"]:
+ res["msg"] += f" {fname} not found. Tested regexes:\n"
+ for r in re_what:
+ res["msg"] += " " + r.pattern + "\n"
+
+ except KeyboardInterrupt:
+ pass
+
+ return res_list
+
+ def _ref_interactor(self, root):
+ """Recursive function to interact over the sysfs tree"""
+
+ for k, v in root.items():
+ if isinstance(v, dict):
+ yield from self._ref_interactor(v)
+
+ if root == self.root or k == "__name":
+ continue
+
+ if self.abi.re_string:
+ fname = v["__name"][0]
+ if self.abi.re_string.search(fname):
+ yield v
+ else:
+ yield v
+
+
+ def get_fileref(self, all_refs, chunk_size):
+ """Interactor to group refs into chunks"""
+
+ n = 0
+ refs = []
+
+ for ref in all_refs:
+ refs.append(ref)
+
+ n += 1
+ if n >= chunk_size:
+ yield refs
+ n = 0
+ refs = []
+
+ yield refs
+
+ def check_undefined_symbols(self, max_workers=None, chunk_size=50,
+ found=None, dry_run=None):
+ """Seach ABI for sysfs symbols missing documentation"""
+
+ self.abi.parse_abi()
+
+ if self.abi.debug & AbiDebug.GRAPH:
+ self.print_graph()
+
+ all_refs = []
+ for ref in self._ref_interactor(self.root):
+ all_refs.append(ref["__name"])
+
+ if dry_run:
+ print("Would check", file=sys.stderr)
+ for ref in all_refs:
+ print(", ".join(ref))
+
+ return
+
+ print("Starting to search symbols (it may take several minutes):",
+ file=sys.stderr)
+ start = datetime.now()
+ old_elapsed = None
+
+ # Python doesn't support multithreading due to limitations on its
+ # global lock (GIL). While Python 3.13 finally made GIL optional,
+ # there are still issues related to it. Also, we want to have
+ # backward compatibility with older versions of Python.
+ #
+ # So, use instead multiprocess. However, Python is very slow passing
+ # data from/to multiple processes. Also, it may consume lots of memory
+ # if the data to be shared is not small. So, we need to group workload
+ # in chunks that are big enough to generate performance gains while
+ # not being so big that would cause out-of-memory.
+
+ num_refs = len(all_refs)
+ print(f"Number of references to parse: {num_refs}", file=sys.stderr)
+
+ if not max_workers:
+ max_workers = os.cpu_count()
+ elif max_workers > os.cpu_count():
+ max_workers = os.cpu_count()
+
+ max_workers = max(max_workers, 1)
+
+ max_chunk_size = int((num_refs + max_workers - 1) / max_workers)
+ chunk_size = min(chunk_size, max_chunk_size)
+ chunk_size = max(1, chunk_size)
+
+ if max_workers > 1:
+ executor = futures.ProcessPoolExecutor
+
+ # Place references in a random order. This may help improving
+ # performance, by mixing complex/simple expressions when creating
+ # chunks
+ shuffle(all_refs)
+ else:
+ # Python has a high overhead with processes. When there's just
+ # one worker, it is faster to not create a new process.
+ # Yet, User still deserves to have a progress print. So, use
+ # python's "thread", which is actually a single process, using
+ # an internal schedule to switch between tasks. No performance
+ # gains for non-IO tasks, but still it can be quickly interrupted
+ # from time to time to display progress.
+ executor = futures.ThreadPoolExecutor
+
+ not_found = []
+ f_list = []
+ with executor(max_workers=max_workers) as exe:
+ for refs in self.get_fileref(all_refs, chunk_size):
+ if refs:
+ try:
+ f_list.append(exe.submit(self.check_file, refs, found))
+
+ except KeyboardInterrupt:
+ return
+
+ total = len(f_list)
+
+ if not total:
+ if self.abi.re_string:
+ print(f"No ABI symbol matches {self.abi.search_string}")
+ else:
+ self.abi.log.warning("No ABI symbols found")
+ return
+
+ print(f"{len(f_list):6d} jobs queued on {max_workers} workers",
+ file=sys.stderr)
+
+ while f_list:
+ try:
+ t = futures.wait(f_list, timeout=1,
+ return_when=futures.FIRST_COMPLETED)
+
+ done = t[0]
+
+ for fut in done:
+ res_list = fut.result()
+
+ for res in res_list:
+ if not res["found"]:
+ not_found.append(res["fname"])
+ if res["msg"]:
+ print(res["msg"])
+
+ f_list.remove(fut)
+ except KeyboardInterrupt:
+ return
+
+ except RuntimeError as e:
+ self.abi.log.warning(f"Future: {e}")
+ break
+
+ if sys.stderr.isatty():
+ elapsed = str(datetime.now() - start).split(".", maxsplit=1)[0]
+ if len(f_list) < total:
+ elapsed += f" ({total - len(f_list)}/{total} jobs completed). "
+ if elapsed != old_elapsed:
+ print(elapsed + "\r", end="", flush=True,
+ file=sys.stderr)
+ old_elapsed = elapsed
+
+ elapsed = str(datetime.now() - start).split(".", maxsplit=1)[0]
+ print(elapsed, file=sys.stderr)
+
+ for f in sorted(not_found):
+ print(f"{f} not found.")
diff --git a/tools/lib/python/feat/parse_features.py b/tools/lib/python/feat/parse_features.py
new file mode 100755
index 000000000000..b88c04d3e2fe
--- /dev/null
+++ b/tools/lib/python/feat/parse_features.py
@@ -0,0 +1,494 @@
+#!/usr/bin/env python3
+# pylint: disable=R0902,R0911,R0912,R0914,R0915
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+# SPDX-License-Identifier: GPL-2.0
+
+
+"""
+Library to parse the Linux Feature files and produce a ReST book.
+"""
+
+import os
+import re
+import sys
+
+from glob import iglob
+
+
+class ParseFeature:
+ """
+ Parses Documentation/features, allowing to generate ReST documentation
+ from it.
+ """
+
+ h_name = "Feature"
+ h_kconfig = "Kconfig"
+ h_description = "Description"
+ h_subsys = "Subsystem"
+ h_status = "Status"
+ h_arch = "Architecture"
+
+ # Sort order for status. Others will be mapped at the end.
+ status_map = {
+ "ok": 0,
+ "TODO": 1,
+ "N/A": 2,
+ # The only missing status is "..", which was mapped as "---",
+ # as this is an special ReST cell value. Let it get the
+ # default order (99).
+ }
+
+ def __init__(self, prefix, debug=0, enable_fname=False):
+ """
+ Sets internal variables
+ """
+
+ self.prefix = prefix
+ self.debug = debug
+ self.enable_fname = enable_fname
+
+ self.data = {}
+
+ # Initial maximum values use just the headers
+ self.max_size_name = len(self.h_name)
+ self.max_size_kconfig = len(self.h_kconfig)
+ self.max_size_description = len(self.h_description)
+ self.max_size_desc_word = 0
+ self.max_size_subsys = len(self.h_subsys)
+ self.max_size_status = len(self.h_status)
+ self.max_size_arch = len(self.h_arch)
+ self.max_size_arch_with_header = self.max_size_arch + self.max_size_arch
+ self.description_size = 1
+
+ self.msg = ""
+
+ def emit(self, msg="", end="\n"):
+ self.msg += msg + end
+
+ def parse_error(self, fname, ln, msg, data=None):
+ """
+ Displays an error message, printing file name and line
+ """
+
+ if ln:
+ fname += f"#{ln}"
+
+ print(f"Warning: file {fname}: {msg}", file=sys.stderr, end="")
+
+ if data:
+ data = data.rstrip()
+ print(f":\n\t{data}", file=sys.stderr)
+ else:
+ print("", file=sys.stderr)
+
+ def parse_feat_file(self, fname):
+ """Parses a single arch-support.txt feature file"""
+
+ if os.path.isdir(fname):
+ return
+
+ base = os.path.basename(fname)
+
+ if base != "arch-support.txt":
+ if self.debug:
+ print(f"ignoring {fname}", file=sys.stderr)
+ return
+
+ subsys = os.path.dirname(fname).split("/")[-2]
+ self.max_size_subsys = max(self.max_size_subsys, len(subsys))
+
+ feature_name = ""
+ kconfig = ""
+ description = ""
+ comments = ""
+ arch_table = {}
+
+ if self.debug > 1:
+ print(f"Opening {fname}", file=sys.stderr)
+
+ if self.enable_fname:
+ full_fname = os.path.abspath(fname)
+ self.emit(f".. FILE {full_fname}")
+
+ with open(fname, encoding="utf-8") as f:
+ for ln, line in enumerate(f, start=1):
+ line = line.strip()
+
+ match = re.match(r"^\#\s+Feature\s+name:\s*(.*\S)", line)
+ if match:
+ feature_name = match.group(1)
+
+ self.max_size_name = max(self.max_size_name,
+ len(feature_name))
+ continue
+
+ match = re.match(r"^\#\s+Kconfig:\s*(.*\S)", line)
+ if match:
+ kconfig = match.group(1)
+
+ self.max_size_kconfig = max(self.max_size_kconfig,
+ len(kconfig))
+ continue
+
+ match = re.match(r"^\#\s+description:\s*(.*\S)", line)
+ if match:
+ description = match.group(1)
+
+ self.max_size_description = max(self.max_size_description,
+ len(description))
+
+ words = re.split(r"\s+", line)[1:]
+ for word in words:
+ self.max_size_desc_word = max(self.max_size_desc_word,
+ len(word))
+
+ continue
+
+ if re.search(r"^\\s*$", line):
+ continue
+
+ if re.match(r"^\s*\-+\s*$", line):
+ continue
+
+ if re.search(r"^\s*\|\s*arch\s*\|\s*status\s*\|\s*$", line):
+ continue
+
+ match = re.match(r"^\#\s*(.*)$", line)
+ if match:
+ comments += match.group(1)
+ continue
+
+ match = re.match(r"^\s*\|\s*(\S+):\s*\|\s*(\S+)\s*\|\s*$", line)
+ if match:
+ arch = match.group(1)
+ status = match.group(2)
+
+ self.max_size_status = max(self.max_size_status,
+ len(status))
+ self.max_size_arch = max(self.max_size_arch, len(arch))
+
+ if status == "..":
+ status = "---"
+
+ arch_table[arch] = status
+
+ continue
+
+ self.parse_error(fname, ln, "Line is invalid", line)
+
+ if not feature_name:
+ self.parse_error(fname, 0, "Feature name not found")
+ return
+ if not subsys:
+ self.parse_error(fname, 0, "Subsystem not found")
+ return
+ if not kconfig:
+ self.parse_error(fname, 0, "Kconfig not found")
+ return
+ if not description:
+ self.parse_error(fname, 0, "Description not found")
+ return
+ if not arch_table:
+ self.parse_error(fname, 0, "Architecture table not found")
+ return
+
+ self.data[feature_name] = {
+ "where": fname,
+ "subsys": subsys,
+ "kconfig": kconfig,
+ "description": description,
+ "comments": comments,
+ "table": arch_table,
+ }
+
+ self.max_size_arch_with_header = self.max_size_arch + len(self.h_arch)
+
+ def parse(self):
+ """Parses all arch-support.txt feature files inside self.prefix"""
+
+ path = os.path.expanduser(self.prefix)
+
+ if self.debug > 2:
+ print(f"Running parser for {path}")
+
+ example_path = os.path.join(path, "arch-support.txt")
+
+ for fname in iglob(os.path.join(path, "**"), recursive=True):
+ if fname != example_path:
+ self.parse_feat_file(fname)
+
+ return self.data
+
+ def output_arch_table(self, arch, feat=None):
+ """
+ Output feature(s) for a given architecture.
+ """
+
+ title = f"Feature status on {arch} architecture"
+
+ self.emit("=" * len(title))
+ self.emit(title)
+ self.emit("=" * len(title))
+ self.emit()
+
+ self.emit("=" * self.max_size_subsys + " ", end="")
+ self.emit("=" * self.max_size_name + " ", end="")
+ self.emit("=" * self.max_size_kconfig + " ", end="")
+ self.emit("=" * self.max_size_status + " ", end="")
+ self.emit("=" * self.max_size_description)
+
+ self.emit(f"{self.h_subsys:<{self.max_size_subsys}} ", end="")
+ self.emit(f"{self.h_name:<{self.max_size_name}} ", end="")
+ self.emit(f"{self.h_kconfig:<{self.max_size_kconfig}} ", end="")
+ self.emit(f"{self.h_status:<{self.max_size_status}} ", end="")
+ self.emit(f"{self.h_description:<{self.max_size_description}}")
+
+ self.emit("=" * self.max_size_subsys + " ", end="")
+ self.emit("=" * self.max_size_name + " ", end="")
+ self.emit("=" * self.max_size_kconfig + " ", end="")
+ self.emit("=" * self.max_size_status + " ", end="")
+ self.emit("=" * self.max_size_description)
+
+ sorted_features = sorted(self.data.keys(),
+ key=lambda x: (self.data[x]["subsys"],
+ x.lower()))
+
+ for name in sorted_features:
+ if feat and name != feat:
+ continue
+
+ arch_table = self.data[name]["table"]
+
+ if not arch in arch_table:
+ continue
+
+ self.emit(f"{self.data[name]['subsys']:<{self.max_size_subsys}} ",
+ end="")
+ self.emit(f"{name:<{self.max_size_name}} ", end="")
+ self.emit(f"{self.data[name]['kconfig']:<{self.max_size_kconfig}} ",
+ end="")
+ self.emit(f"{arch_table[arch]:<{self.max_size_status}} ",
+ end="")
+ self.emit(f"{self.data[name]['description']}")
+
+ self.emit("=" * self.max_size_subsys + " ", end="")
+ self.emit("=" * self.max_size_name + " ", end="")
+ self.emit("=" * self.max_size_kconfig + " ", end="")
+ self.emit("=" * self.max_size_status + " ", end="")
+ self.emit("=" * self.max_size_description)
+
+ return self.msg
+
+ def output_feature(self, feat):
+ """
+ Output a feature on all architectures
+ """
+
+ title = f"Feature {feat}"
+
+ self.emit("=" * len(title))
+ self.emit(title)
+ self.emit("=" * len(title))
+ self.emit()
+
+ if not feat in self.data:
+ return
+
+ if self.data[feat]["subsys"]:
+ self.emit(f":Subsystem: {self.data[feat]['subsys']}")
+ if self.data[feat]["kconfig"]:
+ self.emit(f":Kconfig: {self.data[feat]['kconfig']}")
+
+ desc = self.data[feat]["description"]
+ desc = desc[0].upper() + desc[1:]
+ desc = desc.rstrip(". \t")
+ self.emit(f"\n{desc}.\n")
+
+ com = self.data[feat]["comments"].strip()
+ if com:
+ self.emit("Comments")
+ self.emit("--------")
+ self.emit(f"\n{com}\n")
+
+ self.emit("=" * self.max_size_arch + " ", end="")
+ self.emit("=" * self.max_size_status)
+
+ self.emit(f"{self.h_arch:<{self.max_size_arch}} ", end="")
+ self.emit(f"{self.h_status:<{self.max_size_status}}")
+
+ self.emit("=" * self.max_size_arch + " ", end="")
+ self.emit("=" * self.max_size_status)
+
+ arch_table = self.data[feat]["table"]
+ for arch in sorted(arch_table.keys()):
+ self.emit(f"{arch:<{self.max_size_arch}} ", end="")
+ self.emit(f"{arch_table[arch]:<{self.max_size_status}}")
+
+ self.emit("=" * self.max_size_arch + " ", end="")
+ self.emit("=" * self.max_size_status)
+
+ return self.msg
+
+ def matrix_lines(self, desc_size, max_size_status, header):
+ """
+ Helper function to split element tables at the output matrix
+ """
+
+ if header:
+ ln_marker = "="
+ else:
+ ln_marker = "-"
+
+ self.emit("+" + ln_marker * self.max_size_name + "+", end="")
+ self.emit(ln_marker * desc_size, end="")
+ self.emit("+" + ln_marker * max_size_status + "+")
+
+ def output_matrix(self):
+ """
+ Generates a set of tables, groped by subsystem, containing
+ what's the feature state on each architecture.
+ """
+
+ title = "Feature status on all architectures"
+
+ self.emit("=" * len(title))
+ self.emit(title)
+ self.emit("=" * len(title))
+ self.emit()
+
+ desc_title = f"{self.h_kconfig} / {self.h_description}"
+
+ desc_size = self.max_size_kconfig + 4
+ if not self.description_size:
+ desc_size = max(self.max_size_description, desc_size)
+ else:
+ desc_size = max(self.description_size, desc_size)
+
+ desc_size = max(self.max_size_desc_word, desc_size, len(desc_title))
+
+ notcompat = "Not compatible"
+ self.max_size_status = max(self.max_size_status, len(notcompat))
+
+ min_status_size = self.max_size_status + self.max_size_arch + 4
+ max_size_status = max(min_status_size, self.max_size_status)
+
+ h_status_per_arch = "Status per architecture"
+ max_size_status = max(max_size_status, len(h_status_per_arch))
+
+ cur_subsys = None
+ for name in sorted(self.data.keys(),
+ key=lambda x: (self.data[x]["subsys"], x.lower())):
+ if not cur_subsys or cur_subsys != self.data[name]["subsys"]:
+ if cur_subsys:
+ self.emit()
+
+ cur_subsys = self.data[name]["subsys"]
+
+ title = f"Subsystem: {cur_subsys}"
+ self.emit(title)
+ self.emit("=" * len(title))
+ self.emit()
+
+ self.matrix_lines(desc_size, max_size_status, 0)
+
+ self.emit(f"|{self.h_name:<{self.max_size_name}}", end="")
+ self.emit(f"|{desc_title:<{desc_size}}", end="")
+ self.emit(f"|{h_status_per_arch:<{max_size_status}}|")
+
+ self.matrix_lines(desc_size, max_size_status, 1)
+
+ lines = []
+ descs = []
+ cur_status = ""
+ line = ""
+
+ arch_table = sorted(self.data[name]["table"].items(),
+ key=lambda x: (self.status_map.get(x[1], 99),
+ x[0].lower()))
+
+ for arch, status in arch_table:
+ if status == "---":
+ status = notcompat
+
+ if status != cur_status:
+ if line != "":
+ lines.append(line)
+ line = ""
+ line = f"- **{status}**: {arch}"
+ elif len(line) + len(arch) + 2 < max_size_status:
+ line += f", {arch}"
+ else:
+ lines.append(line)
+ line = f" {arch}"
+ cur_status = status
+
+ if line != "":
+ lines.append(line)
+
+ description = self.data[name]["description"]
+ while len(description) > desc_size:
+ desc_line = description[:desc_size]
+
+ last_space = desc_line.rfind(" ")
+ if last_space != -1:
+ desc_line = desc_line[:last_space]
+ descs.append(desc_line)
+ description = description[last_space + 1:]
+ else:
+ desc_line = desc_line[:-1]
+ descs.append(desc_line + "\\")
+ description = description[len(desc_line):]
+
+ if description:
+ descs.append(description)
+
+ while len(lines) < 2 + len(descs):
+ lines.append("")
+
+ for ln, line in enumerate(lines):
+ col = ["", ""]
+
+ if not ln:
+ col[0] = name
+ col[1] = f"``{self.data[name]['kconfig']}``"
+ else:
+ if ln >= 2 and descs:
+ col[1] = descs.pop(0)
+
+ self.emit(f"|{col[0]:<{self.max_size_name}}", end="")
+ self.emit(f"|{col[1]:<{desc_size}}", end="")
+ self.emit(f"|{line:<{max_size_status}}|")
+
+ self.matrix_lines(desc_size, max_size_status, 0)
+
+ return self.msg
+
+ def list_arch_features(self, arch, feat):
+ """
+ Print a matrix of kernel feature support for the chosen architecture.
+ """
+ self.emit("#")
+ self.emit(f"# Kernel feature support matrix of the '{arch}' architecture:")
+ self.emit("#")
+
+ # Sort by subsystem, then by feature name (case‑insensitive)
+ for name in sorted(self.data.keys(),
+ key=lambda n: (self.data[n]["subsys"].lower(),
+ n.lower())):
+ if feat and name != feat:
+ continue
+
+ feature = self.data[name]
+ arch_table = feature["table"]
+ status = arch_table.get(arch, "")
+ status = " " * ((4 - len(status)) // 2) + status
+
+ self.emit(f"{feature['subsys']:>{self.max_size_subsys + 1}}/ ",
+ end="")
+ self.emit(f"{name:<{self.max_size_name}}: ", end="")
+ self.emit(f"{status:<5}| ", end="")
+ self.emit(f"{feature['kconfig']:>{self.max_size_kconfig}} ",
+ end="")
+ self.emit(f"# {feature['description']}")
+
+ return self.msg
diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py
new file mode 100755
index 000000000000..a24f30ef4fa8
--- /dev/null
+++ b/tools/lib/python/jobserver.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0+
+#
+# pylint: disable=C0103,C0209
+#
+#
+
+"""
+Interacts with the POSIX jobserver during the Kernel build time.
+
+A "normal" jobserver task, like the one initiated by a make subrocess would do:
+
+ - open read/write file descriptors to communicate with the job server;
+ - ask for one slot by calling:
+ claim = os.read(reader, 1)
+ - when the job finshes, call:
+ os.write(writer, b"+") # os.write(writer, claim)
+
+Here, the goal is different: This script aims to get the remaining number
+of slots available, using all of them to run a command which handle tasks in
+parallel. To to that, it has a loop that ends only after there are no
+slots left. It then increments the number by one, in order to allow a
+call equivalent to make -j$((claim+1)), e.g. having a parent make creating
+$claim child to do the actual work.
+
+The end goal here is to keep the total number of build tasks under the
+limit established by the initial make -j$n_proc call.
+
+See:
+ https://www.gnu.org/software/make/manual/html_node/POSIX-Jobserver.html#POSIX-Jobserver
+"""
+
+import errno
+import os
+import subprocess
+import sys
+
+class JobserverExec:
+ """
+ Claim all slots from make using POSIX Jobserver.
+
+ The main methods here are:
+ - open(): reserves all slots;
+ - close(): method returns all used slots back to make;
+ - run(): executes a command setting PARALLELISM=<available slots jobs + 1>
+ """
+
+ def __init__(self):
+ """Initialize internal vars"""
+ self.claim = 0
+ self.jobs = b""
+ self.reader = None
+ self.writer = None
+ self.is_open = False
+
+ def open(self):
+ """Reserve all available slots to be claimed later on"""
+
+ if self.is_open:
+ return
+
+ try:
+ # Fetch the make environment options.
+ flags = os.environ["MAKEFLAGS"]
+ # Look for "--jobserver=R,W"
+ # Note that GNU Make has used --jobserver-fds and --jobserver-auth
+ # so this handles all of them.
+ opts = [x for x in flags.split(" ") if x.startswith("--jobserver")]
+
+ # Parse out R,W file descriptor numbers and set them nonblocking.
+ # If the MAKEFLAGS variable contains multiple instances of the
+ # --jobserver-auth= option, the last one is relevant.
+ fds = opts[-1].split("=", 1)[1]
+
+ # Starting with GNU Make 4.4, named pipes are used for reader
+ # and writer.
+ # Example argument: --jobserver-auth=fifo:/tmp/GMfifo8134
+ _, _, path = fds.partition("fifo:")
+
+ if path:
+ self.reader = os.open(path, os.O_RDONLY | os.O_NONBLOCK)
+ self.writer = os.open(path, os.O_WRONLY)
+ else:
+ self.reader, self.writer = [int(x) for x in fds.split(",", 1)]
+ # Open a private copy of reader to avoid setting nonblocking
+ # on an unexpecting process with the same reader fd.
+ self.reader = os.open("/proc/self/fd/%d" % (self.reader),
+ os.O_RDONLY | os.O_NONBLOCK)
+
+ # Read out as many jobserver slots as possible
+ while True:
+ try:
+ slot = os.read(self.reader, 8)
+ self.jobs += slot
+ except (OSError, IOError) as e:
+ if e.errno == errno.EWOULDBLOCK:
+ # Stop at the end of the jobserver queue.
+ break
+ # If something went wrong, give back the jobs.
+ if self.jobs:
+ os.write(self.writer, self.jobs)
+ raise e
+
+ # Add a bump for our caller's reserveration, since we're just going
+ # to sit here blocked on our child.
+ self.claim = len(self.jobs) + 1
+
+ except (KeyError, IndexError, ValueError, OSError, IOError):
+ # Any missing environment strings or bad fds should result in just
+ # not being parallel.
+ self.claim = None
+
+ self.is_open = True
+
+ def close(self):
+ """Return all reserved slots to Jobserver"""
+
+ if not self.is_open:
+ return
+
+ # Return all the reserved slots.
+ if len(self.jobs):
+ os.write(self.writer, self.jobs)
+
+ self.is_open = False
+
+ def __enter__(self):
+ self.open()
+ return self
+
+ def __exit__(self, exc_type, exc_value, exc_traceback):
+ self.close()
+
+ def run(self, cmd, *args, **pwargs):
+ """
+ Run a command setting PARALLELISM env variable to the number of
+ available job slots (claim) + 1, e.g. it will reserve claim slots
+ to do the actual build work, plus one to monitor its children.
+ """
+ self.open() # Ensure that self.claim is set
+
+ # We can only claim parallelism if there was a jobserver (i.e. a
+ # top-level "-jN" argument) and there were no other failures. Otherwise
+ # leave out the environment variable and let the child figure out what
+ # is best.
+ if self.claim:
+ os.environ["PARALLELISM"] = str(self.claim)
+
+ return subprocess.call(cmd, *args, **pwargs)
diff --git a/tools/lib/python/kdoc/__init__.py b/tools/lib/python/kdoc/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/tools/lib/python/kdoc/__init__.py
diff --git a/tools/docs/lib/enrich_formatter.py b/tools/lib/python/kdoc/enrich_formatter.py
index bb171567a4ca..bb171567a4ca 100644
--- a/tools/docs/lib/enrich_formatter.py
+++ b/tools/lib/python/kdoc/enrich_formatter.py
diff --git a/tools/lib/python/kdoc/kdoc_files.py b/tools/lib/python/kdoc/kdoc_files.py
new file mode 100644
index 000000000000..bfe02baf1606
--- /dev/null
+++ b/tools/lib/python/kdoc/kdoc_files.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+#
+# pylint: disable=R0903,R0913,R0914,R0917
+
+"""
+Parse lernel-doc tags on multiple kernel source files.
+"""
+
+import argparse
+import logging
+import os
+import re
+
+from kdoc.kdoc_parser import KernelDoc
+from kdoc.kdoc_output import OutputFormat
+
+
+class GlobSourceFiles:
+ """
+ Parse C source code file names and directories via an Interactor.
+ """
+
+ def __init__(self, srctree=None, valid_extensions=None):
+ """
+ Initialize valid extensions with a tuple.
+
+ If not defined, assume default C extensions (.c and .h)
+
+ It would be possible to use python's glob function, but it is
+ very slow, and it is not interactive. So, it would wait to read all
+ directories before actually do something.
+
+ So, let's use our own implementation.
+ """
+
+ if not valid_extensions:
+ self.extensions = (".c", ".h")
+ else:
+ self.extensions = valid_extensions
+
+ self.srctree = srctree
+
+ def _parse_dir(self, dirname):
+ """Internal function to parse files recursively"""
+
+ with os.scandir(dirname) as obj:
+ for entry in obj:
+ name = os.path.join(dirname, entry.name)
+
+ if entry.is_dir(follow_symlinks=False):
+ yield from self._parse_dir(name)
+
+ if not entry.is_file():
+ continue
+
+ basename = os.path.basename(name)
+
+ if not basename.endswith(self.extensions):
+ continue
+
+ yield name
+
+ def parse_files(self, file_list, file_not_found_cb):
+ """
+ Define an iterator to parse all source files from file_list,
+ handling directories if any
+ """
+
+ if not file_list:
+ return
+
+ for fname in file_list:
+ if self.srctree:
+ f = os.path.join(self.srctree, fname)
+ else:
+ f = fname
+
+ if os.path.isdir(f):
+ yield from self._parse_dir(f)
+ elif os.path.isfile(f):
+ yield f
+ elif file_not_found_cb:
+ file_not_found_cb(fname)
+
+
+class KernelFiles():
+ """
+ Parse kernel-doc tags on multiple kernel source files.
+
+ There are two type of parsers defined here:
+ - self.parse_file(): parses both kernel-doc markups and
+ EXPORT_SYMBOL* macros;
+ - self.process_export_file(): parses only EXPORT_SYMBOL* macros.
+ """
+
+ def warning(self, msg):
+ """Ancillary routine to output a warning and increment error count"""
+
+ self.config.log.warning(msg)
+ self.errors += 1
+
+ def error(self, msg):
+ """Ancillary routine to output an error and increment error count"""
+
+ self.config.log.error(msg)
+ self.errors += 1
+
+ def parse_file(self, fname):
+ """
+ Parse a single Kernel source.
+ """
+
+ # Prevent parsing the same file twice if results are cached
+ if fname in self.files:
+ return
+
+ doc = KernelDoc(self.config, fname)
+ export_table, entries = doc.parse_kdoc()
+
+ self.export_table[fname] = export_table
+
+ self.files.add(fname)
+ self.export_files.add(fname) # parse_kdoc() already check exports
+
+ self.results[fname] = entries
+
+ def process_export_file(self, fname):
+ """
+ Parses EXPORT_SYMBOL* macros from a single Kernel source file.
+ """
+
+ # Prevent parsing the same file twice if results are cached
+ if fname in self.export_files:
+ return
+
+ doc = KernelDoc(self.config, fname)
+ export_table = doc.parse_export()
+
+ if not export_table:
+ self.error(f"Error: Cannot check EXPORT_SYMBOL* on {fname}")
+ export_table = set()
+
+ self.export_table[fname] = export_table
+ self.export_files.add(fname)
+
+ def file_not_found_cb(self, fname):
+ """
+ Callback to warn if a file was not found.
+ """
+
+ self.error(f"Cannot find file {fname}")
+
+ def __init__(self, verbose=False, out_style=None,
+ werror=False, wreturn=False, wshort_desc=False,
+ wcontents_before_sections=False,
+ logger=None):
+ """
+ Initialize startup variables and parse all files
+ """
+
+ if not verbose:
+ verbose = bool(os.environ.get("KBUILD_VERBOSE", 0))
+
+ if out_style is None:
+ out_style = OutputFormat()
+
+ if not werror:
+ kcflags = os.environ.get("KCFLAGS", None)
+ if kcflags:
+ match = re.search(r"(\s|^)-Werror(\s|$)/", kcflags)
+ if match:
+ werror = True
+
+ # reading this variable is for backwards compat just in case
+ # someone was calling it with the variable from outside the
+ # kernel's build system
+ kdoc_werror = os.environ.get("KDOC_WERROR", None)
+ if kdoc_werror:
+ werror = kdoc_werror
+
+ # Some variables are global to the parser logic as a whole as they are
+ # used to send control configuration to KernelDoc class. As such,
+ # those variables are read-only inside the KernelDoc.
+ self.config = argparse.Namespace
+
+ self.config.verbose = verbose
+ self.config.werror = werror
+ self.config.wreturn = wreturn
+ self.config.wshort_desc = wshort_desc
+ self.config.wcontents_before_sections = wcontents_before_sections
+
+ if not logger:
+ self.config.log = logging.getLogger("kernel-doc")
+ else:
+ self.config.log = logger
+
+ self.config.warning = self.warning
+
+ self.config.src_tree = os.environ.get("SRCTREE", None)
+
+ # Initialize variables that are internal to KernelFiles
+
+ self.out_style = out_style
+
+ self.errors = 0
+ self.results = {}
+
+ self.files = set()
+ self.export_files = set()
+ self.export_table = {}
+
+ def parse(self, file_list, export_file=None):
+ """
+ Parse all files
+ """
+
+ glob = GlobSourceFiles(srctree=self.config.src_tree)
+
+ for fname in glob.parse_files(file_list, self.file_not_found_cb):
+ self.parse_file(fname)
+
+ for fname in glob.parse_files(export_file, self.file_not_found_cb):
+ self.process_export_file(fname)
+
+ def out_msg(self, fname, name, arg):
+ """
+ Return output messages from a file name using the output style
+ filtering.
+
+ If output type was not handled by the styler, return None.
+ """
+
+ # NOTE: we can add rules here to filter out unwanted parts,
+ # although OutputFormat.msg already does that.
+
+ return self.out_style.msg(fname, name, arg)
+
+ def msg(self, enable_lineno=False, export=False, internal=False,
+ symbol=None, nosymbol=None, no_doc_sections=False,
+ filenames=None, export_file=None):
+ """
+ Interacts over the kernel-doc results and output messages,
+ returning kernel-doc markups on each interaction
+ """
+
+ self.out_style.set_config(self.config)
+
+ if not filenames:
+ filenames = sorted(self.results.keys())
+
+ glob = GlobSourceFiles(srctree=self.config.src_tree)
+
+ for fname in filenames:
+ function_table = set()
+
+ if internal or export:
+ if not export_file:
+ export_file = [fname]
+
+ for f in glob.parse_files(export_file, self.file_not_found_cb):
+ function_table |= self.export_table[f]
+
+ if symbol:
+ for s in symbol:
+ function_table.add(s)
+
+ self.out_style.set_filter(export, internal, symbol, nosymbol,
+ function_table, enable_lineno,
+ no_doc_sections)
+
+ msg = ""
+ if fname not in self.results:
+ self.config.log.warning("No kernel-doc for file %s", fname)
+ continue
+
+ symbols = self.results[fname]
+ self.out_style.set_symbols(symbols)
+
+ for arg in symbols:
+ m = self.out_msg(fname, arg.name, arg)
+
+ if m is None:
+ ln = arg.get("ln", 0)
+ dtype = arg.get('type', "")
+
+ self.config.log.warning("%s:%d Can't handle %s",
+ fname, ln, dtype)
+ else:
+ msg += m
+
+ if msg:
+ yield fname, msg
diff --git a/tools/lib/python/kdoc/kdoc_item.py b/tools/lib/python/kdoc/kdoc_item.py
new file mode 100644
index 000000000000..19805301cb2c
--- /dev/null
+++ b/tools/lib/python/kdoc/kdoc_item.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# A class that will, eventually, encapsulate all of the parsed data that we
+# then pass into the output modules.
+#
+
+class KdocItem:
+ def __init__(self, name, fname, type, start_line, **other_stuff):
+ self.name = name
+ self.fname = fname
+ self.type = type
+ self.declaration_start_line = start_line
+ self.sections = {}
+ self.sections_start_lines = {}
+ self.parameterlist = []
+ self.parameterdesc_start_lines = []
+ self.parameterdescs = {}
+ self.parametertypes = {}
+ #
+ # Just save everything else into our own dict so that the output
+ # side can grab it directly as before. As we move things into more
+ # structured data, this will, hopefully, fade away.
+ #
+ self.other_stuff = other_stuff
+
+ def get(self, key, default = None):
+ return self.other_stuff.get(key, default)
+
+ def __getitem__(self, key):
+ return self.get(key)
+
+ #
+ # Tracking of section and parameter information.
+ #
+ def set_sections(self, sections, start_lines):
+ self.sections = sections
+ self.section_start_lines = start_lines
+
+ def set_params(self, names, descs, types, starts):
+ self.parameterlist = names
+ self.parameterdescs = descs
+ self.parametertypes = types
+ self.parameterdesc_start_lines = starts
diff --git a/tools/lib/python/kdoc/kdoc_output.py b/tools/lib/python/kdoc/kdoc_output.py
new file mode 100644
index 000000000000..b1aaa7fc3604
--- /dev/null
+++ b/tools/lib/python/kdoc/kdoc_output.py
@@ -0,0 +1,824 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+#
+# pylint: disable=C0301,R0902,R0911,R0912,R0913,R0914,R0915,R0917
+
+"""
+Implement output filters to print kernel-doc documentation.
+
+The implementation uses a virtual base class (OutputFormat) which
+contains dispatches to virtual methods, and some code to filter
+out output messages.
+
+The actual implementation is done on one separate class per each type
+of output. Currently, there are output classes for ReST and man/troff.
+"""
+
+import os
+import re
+from datetime import datetime
+
+from kdoc.kdoc_parser import KernelDoc, type_param
+from kdoc.kdoc_re import KernRe
+
+
+function_pointer = KernRe(r"([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)", cache=False)
+
+# match expressions used to find embedded type information
+type_constant = KernRe(r"\b``([^\`]+)``\b", cache=False)
+type_constant2 = KernRe(r"\%([-_*\w]+)", cache=False)
+type_func = KernRe(r"(\w+)\(\)", cache=False)
+type_param_ref = KernRe(r"([\!~\*]?)\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)", cache=False)
+
+# Special RST handling for func ptr params
+type_fp_param = KernRe(r"\@(\w+)\(\)", cache=False)
+
+# Special RST handling for structs with func ptr params
+type_fp_param2 = KernRe(r"\@(\w+->\S+)\(\)", cache=False)
+
+type_env = KernRe(r"(\$\w+)", cache=False)
+type_enum = KernRe(r"\&(enum\s*([_\w]+))", cache=False)
+type_struct = KernRe(r"\&(struct\s*([_\w]+))", cache=False)
+type_typedef = KernRe(r"\&(typedef\s*([_\w]+))", cache=False)
+type_union = KernRe(r"\&(union\s*([_\w]+))", cache=False)
+type_member = KernRe(r"\&([_\w]+)(\.|->)([_\w]+)", cache=False)
+type_fallback = KernRe(r"\&([_\w]+)", cache=False)
+type_member_func = type_member + KernRe(r"\(\)", cache=False)
+
+
+class OutputFormat:
+ """
+ Base class for OutputFormat. If used as-is, it means that only
+ warnings will be displayed.
+ """
+
+ # output mode.
+ OUTPUT_ALL = 0 # output all symbols and doc sections
+ OUTPUT_INCLUDE = 1 # output only specified symbols
+ OUTPUT_EXPORTED = 2 # output exported symbols
+ OUTPUT_INTERNAL = 3 # output non-exported symbols
+
+ # Virtual member to be overridden at the inherited classes
+ highlights = []
+
+ def __init__(self):
+ """Declare internal vars and set mode to OUTPUT_ALL"""
+
+ self.out_mode = self.OUTPUT_ALL
+ self.enable_lineno = None
+ self.nosymbol = {}
+ self.symbol = None
+ self.function_table = None
+ self.config = None
+ self.no_doc_sections = False
+
+ self.data = ""
+
+ def set_config(self, config):
+ """
+ Setup global config variables used by both parser and output.
+ """
+
+ self.config = config
+
+ def set_filter(self, export, internal, symbol, nosymbol, function_table,
+ enable_lineno, no_doc_sections):
+ """
+ Initialize filter variables according to the requested mode.
+
+ Only one choice is valid between export, internal and symbol.
+
+ The nosymbol filter can be used on all modes.
+ """
+
+ self.enable_lineno = enable_lineno
+ self.no_doc_sections = no_doc_sections
+ self.function_table = function_table
+
+ if symbol:
+ self.out_mode = self.OUTPUT_INCLUDE
+ elif export:
+ self.out_mode = self.OUTPUT_EXPORTED
+ elif internal:
+ self.out_mode = self.OUTPUT_INTERNAL
+ else:
+ self.out_mode = self.OUTPUT_ALL
+
+ if nosymbol:
+ self.nosymbol = set(nosymbol)
+
+
+ def highlight_block(self, block):
+ """
+ Apply the RST highlights to a sub-block of text.
+ """
+
+ for r, sub in self.highlights:
+ block = r.sub(sub, block)
+
+ return block
+
+ def out_warnings(self, args):
+ """
+ Output warnings for identifiers that will be displayed.
+ """
+
+ for log_msg in args.warnings:
+ self.config.warning(log_msg)
+
+ def check_doc(self, name, args):
+ """Check if DOC should be output"""
+
+ if self.no_doc_sections:
+ return False
+
+ if name in self.nosymbol:
+ return False
+
+ if self.out_mode == self.OUTPUT_ALL:
+ self.out_warnings(args)
+ return True
+
+ if self.out_mode == self.OUTPUT_INCLUDE:
+ if name in self.function_table:
+ self.out_warnings(args)
+ return True
+
+ return False
+
+ def check_declaration(self, dtype, name, args):
+ """
+ Checks if a declaration should be output or not based on the
+ filtering criteria.
+ """
+
+ if name in self.nosymbol:
+ return False
+
+ if self.out_mode == self.OUTPUT_ALL:
+ self.out_warnings(args)
+ return True
+
+ if self.out_mode in [self.OUTPUT_INCLUDE, self.OUTPUT_EXPORTED]:
+ if name in self.function_table:
+ return True
+
+ if self.out_mode == self.OUTPUT_INTERNAL:
+ if dtype != "function":
+ self.out_warnings(args)
+ return True
+
+ if name not in self.function_table:
+ self.out_warnings(args)
+ return True
+
+ return False
+
+ def msg(self, fname, name, args):
+ """
+ Handles a single entry from kernel-doc parser
+ """
+
+ self.data = ""
+
+ dtype = args.type
+
+ if dtype == "doc":
+ self.out_doc(fname, name, args)
+ return self.data
+
+ if not self.check_declaration(dtype, name, args):
+ return self.data
+
+ if dtype == "function":
+ self.out_function(fname, name, args)
+ return self.data
+
+ if dtype == "enum":
+ self.out_enum(fname, name, args)
+ return self.data
+
+ if dtype == "typedef":
+ self.out_typedef(fname, name, args)
+ return self.data
+
+ if dtype in ["struct", "union"]:
+ self.out_struct(fname, name, args)
+ return self.data
+
+ # Warn if some type requires an output logic
+ self.config.log.warning("doesn't know how to output '%s' block",
+ dtype)
+
+ return None
+
+ # Virtual methods to be overridden by inherited classes
+ # At the base class, those do nothing.
+ def set_symbols(self, symbols):
+ """Get a list of all symbols from kernel_doc"""
+
+ def out_doc(self, fname, name, args):
+ """Outputs a DOC block"""
+
+ def out_function(self, fname, name, args):
+ """Outputs a function"""
+
+ def out_enum(self, fname, name, args):
+ """Outputs an enum"""
+
+ def out_typedef(self, fname, name, args):
+ """Outputs a typedef"""
+
+ def out_struct(self, fname, name, args):
+ """Outputs a struct"""
+
+
+class RestFormat(OutputFormat):
+ """Consts and functions used by ReST output"""
+
+ highlights = [
+ (type_constant, r"``\1``"),
+ (type_constant2, r"``\1``"),
+
+ # Note: need to escape () to avoid func matching later
+ (type_member_func, r":c:type:`\1\2\3\\(\\) <\1>`"),
+ (type_member, r":c:type:`\1\2\3 <\1>`"),
+ (type_fp_param, r"**\1\\(\\)**"),
+ (type_fp_param2, r"**\1\\(\\)**"),
+ (type_func, r"\1()"),
+ (type_enum, r":c:type:`\1 <\2>`"),
+ (type_struct, r":c:type:`\1 <\2>`"),
+ (type_typedef, r":c:type:`\1 <\2>`"),
+ (type_union, r":c:type:`\1 <\2>`"),
+
+ # in rst this can refer to any type
+ (type_fallback, r":c:type:`\1`"),
+ (type_param_ref, r"**\1\2**")
+ ]
+ blankline = "\n"
+
+ sphinx_literal = KernRe(r'^[^.].*::$', cache=False)
+ sphinx_cblock = KernRe(r'^\.\.\ +code-block::', cache=False)
+
+ def __init__(self):
+ """
+ Creates class variables.
+
+ Not really mandatory, but it is a good coding style and makes
+ pylint happy.
+ """
+
+ super().__init__()
+ self.lineprefix = ""
+
+ def print_lineno(self, ln):
+ """Outputs a line number"""
+
+ if self.enable_lineno and ln is not None:
+ ln += 1
+ self.data += f".. LINENO {ln}\n"
+
+ def output_highlight(self, args):
+ """
+ Outputs a C symbol that may require being converted to ReST using
+ the self.highlights variable
+ """
+
+ input_text = args
+ output = ""
+ in_literal = False
+ litprefix = ""
+ block = ""
+
+ for line in input_text.strip("\n").split("\n"):
+
+ # If we're in a literal block, see if we should drop out of it.
+ # Otherwise, pass the line straight through unmunged.
+ if in_literal:
+ if line.strip(): # If the line is not blank
+ # If this is the first non-blank line in a literal block,
+ # figure out the proper indent.
+ if not litprefix:
+ r = KernRe(r'^(\s*)')
+ if r.match(line):
+ litprefix = '^' + r.group(1)
+ else:
+ litprefix = ""
+
+ output += line + "\n"
+ elif not KernRe(litprefix).match(line):
+ in_literal = False
+ else:
+ output += line + "\n"
+ else:
+ output += line + "\n"
+
+ # Not in a literal block (or just dropped out)
+ if not in_literal:
+ block += line + "\n"
+ if self.sphinx_literal.match(line) or self.sphinx_cblock.match(line):
+ in_literal = True
+ litprefix = ""
+ output += self.highlight_block(block)
+ block = ""
+
+ # Handle any remaining block
+ if block:
+ output += self.highlight_block(block)
+
+ # Print the output with the line prefix
+ for line in output.strip("\n").split("\n"):
+ self.data += self.lineprefix + line + "\n"
+
+ def out_section(self, args, out_docblock=False):
+ """
+ Outputs a block section.
+
+ This could use some work; it's used to output the DOC: sections, and
+ starts by putting out the name of the doc section itself, but that
+ tends to duplicate a header already in the template file.
+ """
+ for section, text in args.sections.items():
+ # Skip sections that are in the nosymbol_table
+ if section in self.nosymbol:
+ continue
+
+ if out_docblock:
+ if not self.out_mode == self.OUTPUT_INCLUDE:
+ self.data += f".. _{section}:\n\n"
+ self.data += f'{self.lineprefix}**{section}**\n\n'
+ else:
+ self.data += f'{self.lineprefix}**{section}**\n\n'
+
+ self.print_lineno(args.section_start_lines.get(section, 0))
+ self.output_highlight(text)
+ self.data += "\n"
+ self.data += "\n"
+
+ def out_doc(self, fname, name, args):
+ if not self.check_doc(name, args):
+ return
+ self.out_section(args, out_docblock=True)
+
+ def out_function(self, fname, name, args):
+
+ oldprefix = self.lineprefix
+ signature = ""
+
+ func_macro = args.get('func_macro', False)
+ if func_macro:
+ signature = name
+ else:
+ if args.get('functiontype'):
+ signature = args['functiontype'] + " "
+ signature += name + " ("
+
+ ln = args.declaration_start_line
+ count = 0
+ for parameter in args.parameterlist:
+ if count != 0:
+ signature += ", "
+ count += 1
+ dtype = args.parametertypes.get(parameter, "")
+
+ if function_pointer.search(dtype):
+ signature += function_pointer.group(1) + parameter + function_pointer.group(3)
+ else:
+ signature += dtype
+
+ if not func_macro:
+ signature += ")"
+
+ self.print_lineno(ln)
+ if args.get('typedef') or not args.get('functiontype'):
+ self.data += f".. c:macro:: {name}\n\n"
+
+ if args.get('typedef'):
+ self.data += " **Typedef**: "
+ self.lineprefix = ""
+ self.output_highlight(args.get('purpose', ""))
+ self.data += "\n\n**Syntax**\n\n"
+ self.data += f" ``{signature}``\n\n"
+ else:
+ self.data += f"``{signature}``\n\n"
+ else:
+ self.data += f".. c:function:: {signature}\n\n"
+
+ if not args.get('typedef'):
+ self.print_lineno(ln)
+ self.lineprefix = " "
+ self.output_highlight(args.get('purpose', ""))
+ self.data += "\n"
+
+ # Put descriptive text into a container (HTML <div>) to help set
+ # function prototypes apart
+ self.lineprefix = " "
+
+ if args.parameterlist:
+ self.data += ".. container:: kernelindent\n\n"
+ self.data += f"{self.lineprefix}**Parameters**\n\n"
+
+ for parameter in args.parameterlist:
+ parameter_name = KernRe(r'\[.*').sub('', parameter)
+ dtype = args.parametertypes.get(parameter, "")
+
+ if dtype:
+ self.data += f"{self.lineprefix}``{dtype}``\n"
+ else:
+ self.data += f"{self.lineprefix}``{parameter}``\n"
+
+ self.print_lineno(args.parameterdesc_start_lines.get(parameter_name, 0))
+
+ self.lineprefix = " "
+ if parameter_name in args.parameterdescs and \
+ args.parameterdescs[parameter_name] != KernelDoc.undescribed:
+
+ self.output_highlight(args.parameterdescs[parameter_name])
+ self.data += "\n"
+ else:
+ self.data += f"{self.lineprefix}*undescribed*\n\n"
+ self.lineprefix = " "
+
+ self.out_section(args)
+ self.lineprefix = oldprefix
+
+ def out_enum(self, fname, name, args):
+
+ oldprefix = self.lineprefix
+ ln = args.declaration_start_line
+
+ self.data += f"\n\n.. c:enum:: {name}\n\n"
+
+ self.print_lineno(ln)
+ self.lineprefix = " "
+ self.output_highlight(args.get('purpose', ''))
+ self.data += "\n"
+
+ self.data += ".. container:: kernelindent\n\n"
+ outer = self.lineprefix + " "
+ self.lineprefix = outer + " "
+ self.data += f"{outer}**Constants**\n\n"
+
+ for parameter in args.parameterlist:
+ self.data += f"{outer}``{parameter}``\n"
+
+ if args.parameterdescs.get(parameter, '') != KernelDoc.undescribed:
+ self.output_highlight(args.parameterdescs[parameter])
+ else:
+ self.data += f"{self.lineprefix}*undescribed*\n\n"
+ self.data += "\n"
+
+ self.lineprefix = oldprefix
+ self.out_section(args)
+
+ def out_typedef(self, fname, name, args):
+
+ oldprefix = self.lineprefix
+ ln = args.declaration_start_line
+
+ self.data += f"\n\n.. c:type:: {name}\n\n"
+
+ self.print_lineno(ln)
+ self.lineprefix = " "
+
+ self.output_highlight(args.get('purpose', ''))
+
+ self.data += "\n"
+
+ self.lineprefix = oldprefix
+ self.out_section(args)
+
+ def out_struct(self, fname, name, args):
+
+ purpose = args.get('purpose', "")
+ declaration = args.get('definition', "")
+ dtype = args.type
+ ln = args.declaration_start_line
+
+ self.data += f"\n\n.. c:{dtype}:: {name}\n\n"
+
+ self.print_lineno(ln)
+
+ oldprefix = self.lineprefix
+ self.lineprefix += " "
+
+ self.output_highlight(purpose)
+ self.data += "\n"
+
+ self.data += ".. container:: kernelindent\n\n"
+ self.data += f"{self.lineprefix}**Definition**::\n\n"
+
+ self.lineprefix = self.lineprefix + " "
+
+ declaration = declaration.replace("\t", self.lineprefix)
+
+ self.data += f"{self.lineprefix}{dtype} {name}" + ' {' + "\n"
+ self.data += f"{declaration}{self.lineprefix}" + "};\n\n"
+
+ self.lineprefix = " "
+ self.data += f"{self.lineprefix}**Members**\n\n"
+ for parameter in args.parameterlist:
+ if not parameter or parameter.startswith("#"):
+ continue
+
+ parameter_name = parameter.split("[", maxsplit=1)[0]
+
+ if args.parameterdescs.get(parameter_name) == KernelDoc.undescribed:
+ continue
+
+ self.print_lineno(args.parameterdesc_start_lines.get(parameter_name, 0))
+
+ self.data += f"{self.lineprefix}``{parameter}``\n"
+
+ self.lineprefix = " "
+ self.output_highlight(args.parameterdescs[parameter_name])
+ self.lineprefix = " "
+
+ self.data += "\n"
+
+ self.data += "\n"
+
+ self.lineprefix = oldprefix
+ self.out_section(args)
+
+
+class ManFormat(OutputFormat):
+ """Consts and functions used by man pages output"""
+
+ highlights = (
+ (type_constant, r"\1"),
+ (type_constant2, r"\1"),
+ (type_func, r"\\fB\1\\fP"),
+ (type_enum, r"\\fI\1\\fP"),
+ (type_struct, r"\\fI\1\\fP"),
+ (type_typedef, r"\\fI\1\\fP"),
+ (type_union, r"\\fI\1\\fP"),
+ (type_param, r"\\fI\1\\fP"),
+ (type_param_ref, r"\\fI\1\2\\fP"),
+ (type_member, r"\\fI\1\2\3\\fP"),
+ (type_fallback, r"\\fI\1\\fP")
+ )
+ blankline = ""
+
+ date_formats = [
+ "%a %b %d %H:%M:%S %Z %Y",
+ "%a %b %d %H:%M:%S %Y",
+ "%Y-%m-%d",
+ "%b %d %Y",
+ "%B %d %Y",
+ "%m %d %Y",
+ ]
+
+ def __init__(self, modulename):
+ """
+ Creates class variables.
+
+ Not really mandatory, but it is a good coding style and makes
+ pylint happy.
+ """
+
+ super().__init__()
+ self.modulename = modulename
+ self.symbols = []
+
+ dt = None
+ tstamp = os.environ.get("KBUILD_BUILD_TIMESTAMP")
+ if tstamp:
+ for fmt in self.date_formats:
+ try:
+ dt = datetime.strptime(tstamp, fmt)
+ break
+ except ValueError:
+ pass
+
+ if not dt:
+ dt = datetime.now()
+
+ self.man_date = dt.strftime("%B %Y")
+
+ def arg_name(self, args, name):
+ """
+ Return the name that will be used for the man page.
+
+ As we may have the same name on different namespaces,
+ prepend the data type for all types except functions and typedefs.
+
+ The doc section is special: it uses the modulename.
+ """
+
+ dtype = args.type
+
+ if dtype == "doc":
+ return self.modulename
+
+ if dtype in ["function", "typedef"]:
+ return name
+
+ return f"{dtype} {name}"
+
+ def set_symbols(self, symbols):
+ """
+ Get a list of all symbols from kernel_doc.
+
+ Man pages will uses it to add a SEE ALSO section with other
+ symbols at the same file.
+ """
+ self.symbols = symbols
+
+ def out_tail(self, fname, name, args):
+ """Adds a tail for all man pages"""
+
+ # SEE ALSO section
+ self.data += f'.SH "SEE ALSO"' + "\n.PP\n"
+ self.data += (f"Kernel file \\fB{args.fname}\\fR\n")
+ if len(self.symbols) >= 2:
+ cur_name = self.arg_name(args, name)
+
+ related = []
+ for arg in self.symbols:
+ out_name = self.arg_name(arg, arg.name)
+
+ if cur_name == out_name:
+ continue
+
+ related.append(f"\\fB{out_name}\\fR(9)")
+
+ self.data += ",\n".join(related) + "\n"
+
+ # TODO: does it make sense to add other sections? Maybe
+ # REPORTING ISSUES? LICENSE?
+
+ def msg(self, fname, name, args):
+ """
+ Handles a single entry from kernel-doc parser.
+
+ Add a tail at the end of man pages output.
+ """
+ super().msg(fname, name, args)
+ self.out_tail(fname, name, args)
+
+ return self.data
+
+ def output_highlight(self, block):
+ """
+ Outputs a C symbol that may require being highlighted with
+ self.highlights variable using troff syntax
+ """
+
+ contents = self.highlight_block(block)
+
+ if isinstance(contents, list):
+ contents = "\n".join(contents)
+
+ for line in contents.strip("\n").split("\n"):
+ line = KernRe(r"^\s*").sub("", line)
+ if not line:
+ continue
+
+ if line[0] == ".":
+ self.data += "\\&" + line + "\n"
+ else:
+ self.data += line + "\n"
+
+ def out_doc(self, fname, name, args):
+ if not self.check_doc(name, args):
+ return
+
+ out_name = self.arg_name(args, name)
+
+ self.data += f'.TH "{self.modulename}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n"
+
+ for section, text in args.sections.items():
+ self.data += f'.SH "{section}"' + "\n"
+ self.output_highlight(text)
+
+ def out_function(self, fname, name, args):
+ """output function in man"""
+
+ out_name = self.arg_name(args, name)
+
+ self.data += f'.TH "{name}" 9 "{out_name}" "{self.man_date}" "Kernel Hacker\'s Manual" LINUX' + "\n"
+
+ self.data += ".SH NAME\n"
+ self.data += f"{name} \\- {args['purpose']}\n"
+
+ self.data += ".SH SYNOPSIS\n"
+ if args.get('functiontype', ''):
+ self.data += f'.B "{args["functiontype"]}" {name}' + "\n"
+ else:
+ self.data += f'.B "{name}' + "\n"
+
+ count = 0
+ parenth = "("
+ post = ","
+
+ for parameter in args.parameterlist:
+ if count == len(args.parameterlist) - 1:
+ post = ");"
+
+ dtype = args.parametertypes.get(parameter, "")
+ if function_pointer.match(dtype):
+ # Pointer-to-function
+ self.data += f'".BI "{parenth}{function_pointer.group(1)}" " ") ({function_pointer.group(2)}){post}"' + "\n"
+ else:
+ dtype = KernRe(r'([^\*])$').sub(r'\1 ', dtype)
+
+ self.data += f'.BI "{parenth}{dtype}" "{post}"' + "\n"
+ count += 1
+ parenth = ""
+
+ if args.parameterlist:
+ self.data += ".SH ARGUMENTS\n"
+
+ for parameter in args.parameterlist:
+ parameter_name = re.sub(r'\[.*', '', parameter)
+
+ self.data += f'.IP "{parameter}" 12' + "\n"
+ self.output_highlight(args.parameterdescs.get(parameter_name, ""))
+
+ for section, text in args.sections.items():
+ self.data += f'.SH "{section.upper()}"' + "\n"
+ self.output_highlight(text)
+
+ def out_enum(self, fname, name, args):
+ out_name = self.arg_name(args, name)
+
+ self.data += f'.TH "{self.modulename}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n"
+
+ self.data += ".SH NAME\n"
+ self.data += f"enum {name} \\- {args['purpose']}\n"
+
+ self.data += ".SH SYNOPSIS\n"
+ self.data += f"enum {name}" + " {\n"
+
+ count = 0
+ for parameter in args.parameterlist:
+ self.data += f'.br\n.BI " {parameter}"' + "\n"
+ if count == len(args.parameterlist) - 1:
+ self.data += "\n};\n"
+ else:
+ self.data += ", \n.br\n"
+
+ count += 1
+
+ self.data += ".SH Constants\n"
+
+ for parameter in args.parameterlist:
+ parameter_name = KernRe(r'\[.*').sub('', parameter)
+ self.data += f'.IP "{parameter}" 12' + "\n"
+ self.output_highlight(args.parameterdescs.get(parameter_name, ""))
+
+ for section, text in args.sections.items():
+ self.data += f'.SH "{section}"' + "\n"
+ self.output_highlight(text)
+
+ def out_typedef(self, fname, name, args):
+ module = self.modulename
+ purpose = args.get('purpose')
+ out_name = self.arg_name(args, name)
+
+ self.data += f'.TH "{module}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n"
+
+ self.data += ".SH NAME\n"
+ self.data += f"typedef {name} \\- {purpose}\n"
+
+ for section, text in args.sections.items():
+ self.data += f'.SH "{section}"' + "\n"
+ self.output_highlight(text)
+
+ def out_struct(self, fname, name, args):
+ module = self.modulename
+ purpose = args.get('purpose')
+ definition = args.get('definition')
+ out_name = self.arg_name(args, name)
+
+ self.data += f'.TH "{module}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n"
+
+ self.data += ".SH NAME\n"
+ self.data += f"{args.type} {name} \\- {purpose}\n"
+
+ # Replace tabs with two spaces and handle newlines
+ declaration = definition.replace("\t", " ")
+ declaration = KernRe(r"\n").sub('"\n.br\n.BI "', declaration)
+
+ self.data += ".SH SYNOPSIS\n"
+ self.data += f"{args.type} {name} " + "{" + "\n.br\n"
+ self.data += f'.BI "{declaration}\n' + "};\n.br\n\n"
+
+ self.data += ".SH Members\n"
+ for parameter in args.parameterlist:
+ if parameter.startswith("#"):
+ continue
+
+ parameter_name = re.sub(r"\[.*", "", parameter)
+
+ if args.parameterdescs.get(parameter_name) == KernelDoc.undescribed:
+ continue
+
+ self.data += f'.IP "{parameter}" 12' + "\n"
+ self.output_highlight(args.parameterdescs.get(parameter_name))
+
+ for section, text in args.sections.items():
+ self.data += f'.SH "{section}"' + "\n"
+ self.output_highlight(text)
diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
new file mode 100644
index 000000000000..500aafc50032
--- /dev/null
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -0,0 +1,1670 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+#
+# pylint: disable=C0301,C0302,R0904,R0912,R0913,R0914,R0915,R0917,R1702
+
+"""
+kdoc_parser
+===========
+
+Read a C language source or header FILE and extract embedded
+documentation comments
+"""
+
+import sys
+import re
+from pprint import pformat
+
+from kdoc.kdoc_re import NestedMatch, KernRe
+from kdoc.kdoc_item import KdocItem
+
+#
+# Regular expressions used to parse kernel-doc markups at KernelDoc class.
+#
+# Let's declare them in lowercase outside any class to make it easier to
+# convert from the Perl script.
+#
+# As those are evaluated at the beginning, no need to cache them
+#
+
+# Allow whitespace at end of comment start.
+doc_start = KernRe(r'^/\*\*\s*$', cache=False)
+
+doc_end = KernRe(r'\*/', cache=False)
+doc_com = KernRe(r'\s*\*\s*', cache=False)
+doc_com_body = KernRe(r'\s*\* ?', cache=False)
+doc_decl = doc_com + KernRe(r'(\w+)', cache=False)
+
+# @params and a strictly limited set of supported section names
+# Specifically:
+# Match @word:
+# @...:
+# @{section-name}:
+# while trying to not match literal block starts like "example::"
+#
+known_section_names = 'description|context|returns?|notes?|examples?'
+known_sections = KernRe(known_section_names, flags = re.I)
+doc_sect = doc_com + \
+ KernRe(r'\s*(@[.\w]+|@\.\.\.|' + known_section_names + r')\s*:([^:].*)?$',
+ flags=re.I, cache=False)
+
+doc_content = doc_com_body + KernRe(r'(.*)', cache=False)
+doc_inline_start = KernRe(r'^\s*/\*\*\s*$', cache=False)
+doc_inline_sect = KernRe(r'\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)', cache=False)
+doc_inline_end = KernRe(r'^\s*\*/\s*$', cache=False)
+doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$', cache=False)
+
+export_symbol = KernRe(r'^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*', cache=False)
+export_symbol_ns = KernRe(r'^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*"\S+"\)\s*', cache=False)
+
+type_param = KernRe(r"@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)", cache=False)
+
+#
+# Tests for the beginning of a kerneldoc block in its various forms.
+#
+doc_block = doc_com + KernRe(r'DOC:\s*(.*)?', cache=False)
+doc_begin_data = KernRe(r"^\s*\*?\s*(struct|union|enum|typedef)\b\s*(\w*)", cache = False)
+doc_begin_func = KernRe(str(doc_com) + # initial " * '
+ r"(?:\w+\s*\*\s*)?" + # type (not captured)
+ r'(?:define\s+)?' + # possible "define" (not captured)
+ r'(\w+)\s*(?:\(\w*\))?\s*' + # name and optional "(...)"
+ r'(?:[-:].*)?$', # description (not captured)
+ cache = False)
+
+#
+# Here begins a long set of transformations to turn structure member prefixes
+# and macro invocations into something we can parse and generate kdoc for.
+#
+struct_args_pattern = r'([^,)]+)'
+
+struct_xforms = [
+ # Strip attributes
+ (KernRe(r"__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)", flags=re.I | re.S, cache=False), ' '),
+ (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '),
+ (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '),
+ (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '),
+ (KernRe(r'\s*__packed\s*', re.S), ' '),
+ (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '),
+ (KernRe(r'\s*__private', re.S), ' '),
+ (KernRe(r'\s*__rcu', re.S), ' '),
+ (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '),
+ (KernRe(r'\s*____cacheline_aligned', re.S), ' '),
+ (KernRe(r'\s*__cacheline_group_(begin|end)\([^\)]+\);'), ''),
+ #
+ # Unwrap struct_group macros based on this definition:
+ # __struct_group(TAG, NAME, ATTRS, MEMBERS...)
+ # which has variants like: struct_group(NAME, MEMBERS...)
+ # Only MEMBERS arguments require documentation.
+ #
+ # Parsing them happens on two steps:
+ #
+ # 1. drop struct group arguments that aren't at MEMBERS,
+ # storing them as STRUCT_GROUP(MEMBERS)
+ #
+ # 2. remove STRUCT_GROUP() ancillary macro.
+ #
+ # The original logic used to remove STRUCT_GROUP() using an
+ # advanced regex:
+ #
+ # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;
+ #
+ # with two patterns that are incompatible with
+ # Python re module, as it has:
+ #
+ # - a recursive pattern: (?1)
+ # - an atomic grouping: (?>...)
+ #
+ # I tried a simpler version: but it didn't work either:
+ # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*;
+ #
+ # As it doesn't properly match the end parenthesis on some cases.
+ #
+ # So, a better solution was crafted: there's now a NestedMatch
+ # class that ensures that delimiters after a search are properly
+ # matched. So, the implementation to drop STRUCT_GROUP() will be
+ # handled in separate.
+ #
+ (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('),
+ (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('),
+ (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('),
+ (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('),
+ #
+ # Replace macros
+ #
+ # TODO: use NestedMatch for FOO($1, $2, ...) matches
+ #
+ # it is better to also move those to the NestedMatch logic,
+ # to ensure that parentheses will be properly matched.
+ #
+ (KernRe(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)', re.S),
+ r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'),
+ (KernRe(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^\)]+)\)', re.S),
+ r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'),
+ (KernRe(r'DECLARE_BITMAP\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)',
+ re.S), r'unsigned long \1[BITS_TO_LONGS(\2)]'),
+ (KernRe(r'DECLARE_HASHTABLE\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)',
+ re.S), r'unsigned long \1[1 << ((\2) - 1)]'),
+ (KernRe(r'DECLARE_KFIFO\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern +
+ r',\s*' + struct_args_pattern + r'\)', re.S), r'\2 *\1'),
+ (KernRe(r'DECLARE_KFIFO_PTR\s*\(' + struct_args_pattern + r',\s*' +
+ struct_args_pattern + r'\)', re.S), r'\2 *\1'),
+ (KernRe(r'(?:__)?DECLARE_FLEX_ARRAY\s*\(' + struct_args_pattern + r',\s*' +
+ struct_args_pattern + r'\)', re.S), r'\1 \2[]'),
+ (KernRe(r'DEFINE_DMA_UNMAP_ADDR\s*\(' + struct_args_pattern + r'\)', re.S), r'dma_addr_t \1'),
+ (KernRe(r'DEFINE_DMA_UNMAP_LEN\s*\(' + struct_args_pattern + r'\)', re.S), r'__u32 \1'),
+]
+#
+# Regexes here are guaranteed to have the end delimiter matching
+# the start delimiter. Yet, right now, only one replace group
+# is allowed.
+#
+struct_nested_prefixes = [
+ (re.compile(r'\bSTRUCT_GROUP\('), r'\1'),
+]
+
+#
+# Transforms for function prototypes
+#
+function_xforms = [
+ (KernRe(r"^static +"), ""),
+ (KernRe(r"^extern +"), ""),
+ (KernRe(r"^asmlinkage +"), ""),
+ (KernRe(r"^inline +"), ""),
+ (KernRe(r"^__inline__ +"), ""),
+ (KernRe(r"^__inline +"), ""),
+ (KernRe(r"^__always_inline +"), ""),
+ (KernRe(r"^noinline +"), ""),
+ (KernRe(r"^__FORTIFY_INLINE +"), ""),
+ (KernRe(r"__init +"), ""),
+ (KernRe(r"__init_or_module +"), ""),
+ (KernRe(r"__deprecated +"), ""),
+ (KernRe(r"__flatten +"), ""),
+ (KernRe(r"__meminit +"), ""),
+ (KernRe(r"__must_check +"), ""),
+ (KernRe(r"__weak +"), ""),
+ (KernRe(r"__sched +"), ""),
+ (KernRe(r"_noprof"), ""),
+ (KernRe(r"__always_unused *"), ""),
+ (KernRe(r"__printf\s*\(\s*\d*\s*,\s*\d*\s*\) +"), ""),
+ (KernRe(r"__(?:re)?alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) +"), ""),
+ (KernRe(r"__diagnose_as\s*\(\s*\S+\s*(?:,\s*\d+\s*)*\) +"), ""),
+ (KernRe(r"DECL_BUCKET_PARAMS\s*\(\s*(\S+)\s*,\s*(\S+)\s*\)"), r"\1, \2"),
+ (KernRe(r"__attribute_const__ +"), ""),
+ (KernRe(r"__attribute__\s*\(\((?:[\w\s]+(?:\([^)]*\))?\s*,?)+\)\)\s+"), ""),
+]
+
+#
+# Apply a set of transforms to a block of text.
+#
+def apply_transforms(xforms, text):
+ for search, subst in xforms:
+ text = search.sub(subst, text)
+ return text
+
+#
+# A little helper to get rid of excess white space
+#
+multi_space = KernRe(r'\s\s+')
+def trim_whitespace(s):
+ return multi_space.sub(' ', s.strip())
+
+#
+# Remove struct/enum members that have been marked "private".
+#
+def trim_private_members(text):
+ #
+ # First look for a "public:" block that ends a private region, then
+ # handle the "private until the end" case.
+ #
+ text = KernRe(r'/\*\s*private:.*?/\*\s*public:.*?\*/', flags=re.S).sub('', text)
+ text = KernRe(r'/\*\s*private:.*', flags=re.S).sub('', text)
+ #
+ # We needed the comments to do the above, but now we can take them out.
+ #
+ return KernRe(r'\s*/\*.*?\*/\s*', flags=re.S).sub('', text).strip()
+
+class state:
+ """
+ State machine enums
+ """
+
+ # Parser states
+ NORMAL = 0 # normal code
+ NAME = 1 # looking for function name
+ DECLARATION = 2 # We have seen a declaration which might not be done
+ BODY = 3 # the body of the comment
+ SPECIAL_SECTION = 4 # doc section ending with a blank line
+ PROTO = 5 # scanning prototype
+ DOCBLOCK = 6 # documentation block
+ INLINE_NAME = 7 # gathering doc outside main block
+ INLINE_TEXT = 8 # reading the body of inline docs
+
+ name = [
+ "NORMAL",
+ "NAME",
+ "DECLARATION",
+ "BODY",
+ "SPECIAL_SECTION",
+ "PROTO",
+ "DOCBLOCK",
+ "INLINE_NAME",
+ "INLINE_TEXT",
+ ]
+
+
+SECTION_DEFAULT = "Description" # default section
+
+class KernelEntry:
+
+ def __init__(self, config, fname, ln):
+ self.config = config
+ self.fname = fname
+
+ self._contents = []
+ self.prototype = ""
+
+ self.warnings = []
+
+ self.parameterlist = []
+ self.parameterdescs = {}
+ self.parametertypes = {}
+ self.parameterdesc_start_lines = {}
+
+ self.section_start_lines = {}
+ self.sections = {}
+
+ self.anon_struct_union = False
+
+ self.leading_space = None
+
+ self.fname = fname
+
+ # State flags
+ self.brcount = 0
+ self.declaration_start_line = ln + 1
+
+ #
+ # Management of section contents
+ #
+ def add_text(self, text):
+ self._contents.append(text)
+
+ def contents(self):
+ return '\n'.join(self._contents) + '\n'
+
+ # TODO: rename to emit_message after removal of kernel-doc.pl
+ def emit_msg(self, ln, msg, *, warning=True):
+ """Emit a message"""
+
+ log_msg = f"{self.fname}:{ln} {msg}"
+
+ if not warning:
+ self.config.log.info(log_msg)
+ return
+
+ # Delegate warning output to output logic, as this way it
+ # will report warnings/info only for symbols that are output
+
+ self.warnings.append(log_msg)
+ return
+
+ #
+ # Begin a new section.
+ #
+ def begin_section(self, line_no, title = SECTION_DEFAULT, dump = False):
+ if dump:
+ self.dump_section(start_new = True)
+ self.section = title
+ self.new_start_line = line_no
+
+ def dump_section(self, start_new=True):
+ """
+ Dumps section contents to arrays/hashes intended for that purpose.
+ """
+ #
+ # If we have accumulated no contents in the default ("description")
+ # section, don't bother.
+ #
+ if self.section == SECTION_DEFAULT and not self._contents:
+ return
+ name = self.section
+ contents = self.contents()
+
+ if type_param.match(name):
+ name = type_param.group(1)
+
+ self.parameterdescs[name] = contents
+ self.parameterdesc_start_lines[name] = self.new_start_line
+
+ self.new_start_line = 0
+
+ else:
+ if name in self.sections and self.sections[name] != "":
+ # Only warn on user-specified duplicate section names
+ if name != SECTION_DEFAULT:
+ self.emit_msg(self.new_start_line,
+ f"duplicate section name '{name}'")
+ # Treat as a new paragraph - add a blank line
+ self.sections[name] += '\n' + contents
+ else:
+ self.sections[name] = contents
+ self.section_start_lines[name] = self.new_start_line
+ self.new_start_line = 0
+
+# self.config.log.debug("Section: %s : %s", name, pformat(vars(self)))
+
+ if start_new:
+ self.section = SECTION_DEFAULT
+ self._contents = []
+
+python_warning = False
+
+class KernelDoc:
+ """
+ Read a C language source or header FILE and extract embedded
+ documentation comments.
+ """
+
+ # Section names
+
+ section_context = "Context"
+ section_return = "Return"
+
+ undescribed = "-- undescribed --"
+
+ def __init__(self, config, fname):
+ """Initialize internal variables"""
+
+ self.fname = fname
+ self.config = config
+
+ # Initial state for the state machines
+ self.state = state.NORMAL
+
+ # Store entry currently being processed
+ self.entry = None
+
+ # Place all potential outputs into an array
+ self.entries = []
+
+ #
+ # We need Python 3.7 for its "dicts remember the insertion
+ # order" guarantee
+ #
+ global python_warning
+ if (not python_warning and
+ sys.version_info.major == 3 and sys.version_info.minor < 7):
+
+ self.emit_msg(0,
+ 'Python 3.7 or later is required for correct results')
+ python_warning = True
+
+ def emit_msg(self, ln, msg, *, warning=True):
+ """Emit a message"""
+
+ if self.entry:
+ self.entry.emit_msg(ln, msg, warning=warning)
+ return
+
+ log_msg = f"{self.fname}:{ln} {msg}"
+
+ if warning:
+ self.config.log.warning(log_msg)
+ else:
+ self.config.log.info(log_msg)
+
+ def dump_section(self, start_new=True):
+ """
+ Dumps section contents to arrays/hashes intended for that purpose.
+ """
+
+ if self.entry:
+ self.entry.dump_section(start_new)
+
+ # TODO: rename it to store_declaration after removal of kernel-doc.pl
+ def output_declaration(self, dtype, name, **args):
+ """
+ Stores the entry into an entry array.
+
+ The actual output and output filters will be handled elsewhere
+ """
+
+ item = KdocItem(name, self.fname, dtype,
+ self.entry.declaration_start_line, **args)
+ item.warnings = self.entry.warnings
+
+ # Drop empty sections
+ # TODO: improve empty sections logic to emit warnings
+ sections = self.entry.sections
+ for section in ["Description", "Return"]:
+ if section in sections and not sections[section].rstrip():
+ del sections[section]
+ item.set_sections(sections, self.entry.section_start_lines)
+ item.set_params(self.entry.parameterlist, self.entry.parameterdescs,
+ self.entry.parametertypes,
+ self.entry.parameterdesc_start_lines)
+ self.entries.append(item)
+
+ self.config.log.debug("Output: %s:%s = %s", dtype, name, pformat(args))
+
+ def reset_state(self, ln):
+ """
+ Ancillary routine to create a new entry. It initializes all
+ variables used by the state machine.
+ """
+
+ #
+ # Flush the warnings out before we proceed further
+ #
+ if self.entry and self.entry not in self.entries:
+ for log_msg in self.entry.warnings:
+ self.config.log.warning(log_msg)
+
+ self.entry = KernelEntry(self.config, self.fname, ln)
+
+ # State flags
+ self.state = state.NORMAL
+
+ def push_parameter(self, ln, decl_type, param, dtype,
+ org_arg, declaration_name):
+ """
+ Store parameters and their descriptions at self.entry.
+ """
+
+ if self.entry.anon_struct_union and dtype == "" and param == "}":
+ return # Ignore the ending }; from anonymous struct/union
+
+ self.entry.anon_struct_union = False
+
+ param = KernRe(r'[\[\)].*').sub('', param, count=1)
+
+ #
+ # Look at various "anonymous type" cases.
+ #
+ if dtype == '':
+ if param.endswith("..."):
+ if len(param) > 3: # there is a name provided, use that
+ param = param[:-3]
+ if not self.entry.parameterdescs.get(param):
+ self.entry.parameterdescs[param] = "variable arguments"
+
+ elif (not param) or param == "void":
+ param = "void"
+ self.entry.parameterdescs[param] = "no arguments"
+
+ elif param in ["struct", "union"]:
+ # Handle unnamed (anonymous) union or struct
+ dtype = param
+ param = "{unnamed_" + param + "}"
+ self.entry.parameterdescs[param] = "anonymous\n"
+ self.entry.anon_struct_union = True
+
+ # Warn if parameter has no description
+ # (but ignore ones starting with # as these are not parameters
+ # but inline preprocessor statements)
+ if param not in self.entry.parameterdescs and not param.startswith("#"):
+ self.entry.parameterdescs[param] = self.undescribed
+
+ if "." not in param:
+ if decl_type == 'function':
+ dname = f"{decl_type} parameter"
+ else:
+ dname = f"{decl_type} member"
+
+ self.emit_msg(ln,
+ f"{dname} '{param}' not described in '{declaration_name}'")
+
+ # Strip spaces from param so that it is one continuous string on
+ # parameterlist. This fixes a problem where check_sections()
+ # cannot find a parameter like "addr[6 + 2]" because it actually
+ # appears as "addr[6", "+", "2]" on the parameter list.
+ # However, it's better to maintain the param string unchanged for
+ # output, so just weaken the string compare in check_sections()
+ # to ignore "[blah" in a parameter string.
+
+ self.entry.parameterlist.append(param)
+ org_arg = KernRe(r'\s\s+').sub(' ', org_arg)
+ self.entry.parametertypes[param] = org_arg
+
+
+ def create_parameter_list(self, ln, decl_type, args,
+ splitter, declaration_name):
+ """
+ Creates a list of parameters, storing them at self.entry.
+ """
+
+ # temporarily replace all commas inside function pointer definition
+ arg_expr = KernRe(r'(\([^\),]+),')
+ while arg_expr.search(args):
+ args = arg_expr.sub(r"\1#", args)
+
+ for arg in args.split(splitter):
+ # Ignore argument attributes
+ arg = KernRe(r'\sPOS0?\s').sub(' ', arg)
+
+ # Strip leading/trailing spaces
+ arg = arg.strip()
+ arg = KernRe(r'\s+').sub(' ', arg, count=1)
+
+ if arg.startswith('#'):
+ # Treat preprocessor directive as a typeless variable just to fill
+ # corresponding data structures "correctly". Catch it later in
+ # output_* subs.
+
+ # Treat preprocessor directive as a typeless variable
+ self.push_parameter(ln, decl_type, arg, "",
+ "", declaration_name)
+ #
+ # The pointer-to-function case.
+ #
+ elif KernRe(r'\(.+\)\s*\(').search(arg):
+ arg = arg.replace('#', ',')
+ r = KernRe(r'[^\(]+\(\*?\s*' # Everything up to "(*"
+ r'([\w\[\].]*)' # Capture the name and possible [array]
+ r'\s*\)') # Make sure the trailing ")" is there
+ if r.match(arg):
+ param = r.group(1)
+ else:
+ self.emit_msg(ln, f"Invalid param: {arg}")
+ param = arg
+ dtype = arg.replace(param, '')
+ self.push_parameter(ln, decl_type, param, dtype, arg, declaration_name)
+ #
+ # The array-of-pointers case. Dig the parameter name out from the middle
+ # of the declaration.
+ #
+ elif KernRe(r'\(.+\)\s*\[').search(arg):
+ r = KernRe(r'[^\(]+\(\s*\*\s*' # Up to "(" and maybe "*"
+ r'([\w.]*?)' # The actual pointer name
+ r'\s*(\[\s*\w+\s*\]\s*)*\)') # The [array portion]
+ if r.match(arg):
+ param = r.group(1)
+ else:
+ self.emit_msg(ln, f"Invalid param: {arg}")
+ param = arg
+ dtype = arg.replace(param, '')
+ self.push_parameter(ln, decl_type, param, dtype, arg, declaration_name)
+ elif arg:
+ #
+ # Clean up extraneous spaces and split the string at commas; the first
+ # element of the resulting list will also include the type information.
+ #
+ arg = KernRe(r'\s*:\s*').sub(":", arg)
+ arg = KernRe(r'\s*\[').sub('[', arg)
+ args = KernRe(r'\s*,\s*').split(arg)
+ args[0] = re.sub(r'(\*+)\s*', r' \1', args[0])
+ #
+ # args[0] has a string of "type a". If "a" includes an [array]
+ # declaration, we want to not be fooled by any white space inside
+ # the brackets, so detect and handle that case specially.
+ #
+ r = KernRe(r'^([^[\]]*\s+)(.*)$')
+ if r.match(args[0]):
+ args[0] = r.group(2)
+ dtype = r.group(1)
+ else:
+ # No space in args[0]; this seems wrong but preserves previous behavior
+ dtype = ''
+
+ bitfield_re = KernRe(r'(.*?):(\w+)')
+ for param in args:
+ #
+ # For pointers, shift the star(s) from the variable name to the
+ # type declaration.
+ #
+ r = KernRe(r'^(\*+)\s*(.*)')
+ if r.match(param):
+ self.push_parameter(ln, decl_type, r.group(2),
+ f"{dtype} {r.group(1)}",
+ arg, declaration_name)
+ #
+ # Perform a similar shift for bitfields.
+ #
+ elif bitfield_re.search(param):
+ if dtype != "": # Skip unnamed bit-fields
+ self.push_parameter(ln, decl_type, bitfield_re.group(1),
+ f"{dtype}:{bitfield_re.group(2)}",
+ arg, declaration_name)
+ else:
+ self.push_parameter(ln, decl_type, param, dtype,
+ arg, declaration_name)
+
+ def check_sections(self, ln, decl_name, decl_type):
+ """
+ Check for errors inside sections, emitting warnings if not found
+ parameters are described.
+ """
+ for section in self.entry.sections:
+ if section not in self.entry.parameterlist and \
+ not known_sections.search(section):
+ if decl_type == 'function':
+ dname = f"{decl_type} parameter"
+ else:
+ dname = f"{decl_type} member"
+ self.emit_msg(ln,
+ f"Excess {dname} '{section}' description in '{decl_name}'")
+
+ def check_return_section(self, ln, declaration_name, return_type):
+ """
+ If the function doesn't return void, warns about the lack of a
+ return description.
+ """
+
+ if not self.config.wreturn:
+ return
+
+ # Ignore an empty return type (It's a macro)
+ # Ignore functions with a "void" return type (but not "void *")
+ if not return_type or KernRe(r'void\s*\w*\s*$').search(return_type):
+ return
+
+ if not self.entry.sections.get("Return", None):
+ self.emit_msg(ln,
+ f"No description found for return value of '{declaration_name}'")
+
+ #
+ # Split apart a structure prototype; returns (struct|union, name, members) or None
+ #
+ def split_struct_proto(self, proto):
+ type_pattern = r'(struct|union)'
+ qualifiers = [
+ "__attribute__",
+ "__packed",
+ "__aligned",
+ "____cacheline_aligned_in_smp",
+ "____cacheline_aligned",
+ ]
+ definition_body = r'\{(.*)\}\s*' + "(?:" + '|'.join(qualifiers) + ")?"
+
+ r = KernRe(type_pattern + r'\s+(\w+)\s*' + definition_body)
+ if r.search(proto):
+ return (r.group(1), r.group(2), r.group(3))
+ else:
+ r = KernRe(r'typedef\s+' + type_pattern + r'\s*' + definition_body + r'\s*(\w+)\s*;')
+ if r.search(proto):
+ return (r.group(1), r.group(3), r.group(2))
+ return None
+ #
+ # Rewrite the members of a structure or union for easier formatting later on.
+ # Among other things, this function will turn a member like:
+ #
+ # struct { inner_members; } foo;
+ #
+ # into:
+ #
+ # struct foo; inner_members;
+ #
+ def rewrite_struct_members(self, members):
+ #
+ # Process struct/union members from the most deeply nested outward. The
+ # trick is in the ^{ below - it prevents a match of an outer struct/union
+ # until the inner one has been munged (removing the "{" in the process).
+ #
+ struct_members = KernRe(r'(struct|union)' # 0: declaration type
+ r'([^\{\};]+)' # 1: possible name
+ r'(\{)'
+ r'([^\{\}]*)' # 3: Contents of declaration
+ r'(\})'
+ r'([^\{\};]*)(;)') # 5: Remaining stuff after declaration
+ tuples = struct_members.findall(members)
+ while tuples:
+ for t in tuples:
+ newmember = ""
+ oldmember = "".join(t) # Reconstruct the original formatting
+ dtype, name, lbr, content, rbr, rest, semi = t
+ #
+ # Pass through each field name, normalizing the form and formatting.
+ #
+ for s_id in rest.split(','):
+ s_id = s_id.strip()
+ newmember += f"{dtype} {s_id}; "
+ #
+ # Remove bitfield/array/pointer info, getting the bare name.
+ #
+ s_id = KernRe(r'[:\[].*').sub('', s_id)
+ s_id = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', s_id)
+ #
+ # Pass through the members of this inner structure/union.
+ #
+ for arg in content.split(';'):
+ arg = arg.strip()
+ #
+ # Look for (type)(*name)(args) - pointer to function
+ #
+ r = KernRe(r'^([^\(]+\(\*?\s*)([\w.]*)(\s*\).*)')
+ if r.match(arg):
+ dtype, name, extra = r.group(1), r.group(2), r.group(3)
+ # Pointer-to-function
+ if not s_id:
+ # Anonymous struct/union
+ newmember += f"{dtype}{name}{extra}; "
+ else:
+ newmember += f"{dtype}{s_id}.{name}{extra}; "
+ #
+ # Otherwise a non-function member.
+ #
+ else:
+ #
+ # Remove bitmap and array portions and spaces around commas
+ #
+ arg = KernRe(r':\s*\d+\s*').sub('', arg)
+ arg = KernRe(r'\[.*\]').sub('', arg)
+ arg = KernRe(r'\s*,\s*').sub(',', arg)
+ #
+ # Look for a normal decl - "type name[,name...]"
+ #
+ r = KernRe(r'(.*)\s+([\S+,]+)')
+ if r.search(arg):
+ for name in r.group(2).split(','):
+ name = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', name)
+ if not s_id:
+ # Anonymous struct/union
+ newmember += f"{r.group(1)} {name}; "
+ else:
+ newmember += f"{r.group(1)} {s_id}.{name}; "
+ else:
+ newmember += f"{arg}; "
+ #
+ # At the end of the s_id loop, replace the original declaration with
+ # the munged version.
+ #
+ members = members.replace(oldmember, newmember)
+ #
+ # End of the tuple loop - search again and see if there are outer members
+ # that now turn up.
+ #
+ tuples = struct_members.findall(members)
+ return members
+
+ #
+ # Format the struct declaration into a standard form for inclusion in the
+ # resulting docs.
+ #
+ def format_struct_decl(self, declaration):
+ #
+ # Insert newlines, get rid of extra spaces.
+ #
+ declaration = KernRe(r'([\{;])').sub(r'\1\n', declaration)
+ declaration = KernRe(r'\}\s+;').sub('};', declaration)
+ #
+ # Format inline enums with each member on its own line.
+ #
+ r = KernRe(r'(enum\s+\{[^\}]+),([^\n])')
+ while r.search(declaration):
+ declaration = r.sub(r'\1,\n\2', declaration)
+ #
+ # Now go through and supply the right number of tabs
+ # for each line.
+ #
+ def_args = declaration.split('\n')
+ level = 1
+ declaration = ""
+ for clause in def_args:
+ clause = KernRe(r'\s+').sub(' ', clause.strip(), count=1)
+ if clause:
+ if '}' in clause and level > 1:
+ level -= 1
+ if not clause.startswith('#'):
+ declaration += "\t" * level
+ declaration += "\t" + clause + "\n"
+ if "{" in clause and "}" not in clause:
+ level += 1
+ return declaration
+
+
+ def dump_struct(self, ln, proto):
+ """
+ Store an entry for a struct or union
+ """
+ #
+ # Do the basic parse to get the pieces of the declaration.
+ #
+ struct_parts = self.split_struct_proto(proto)
+ if not struct_parts:
+ self.emit_msg(ln, f"{proto} error: Cannot parse struct or union!")
+ return
+ decl_type, declaration_name, members = struct_parts
+
+ if self.entry.identifier != declaration_name:
+ self.emit_msg(ln, f"expecting prototype for {decl_type} {self.entry.identifier}. "
+ f"Prototype was for {decl_type} {declaration_name} instead\n")
+ return
+ #
+ # Go through the list of members applying all of our transformations.
+ #
+ members = trim_private_members(members)
+ members = apply_transforms(struct_xforms, members)
+
+ nested = NestedMatch()
+ for search, sub in struct_nested_prefixes:
+ members = nested.sub(search, sub, members)
+ #
+ # Deal with embedded struct and union members, and drop enums entirely.
+ #
+ declaration = members
+ members = self.rewrite_struct_members(members)
+ members = re.sub(r'(\{[^\{\}]*\})', '', members)
+ #
+ # Output the result and we are done.
+ #
+ self.create_parameter_list(ln, decl_type, members, ';',
+ declaration_name)
+ self.check_sections(ln, declaration_name, decl_type)
+ self.output_declaration(decl_type, declaration_name,
+ definition=self.format_struct_decl(declaration),
+ purpose=self.entry.declaration_purpose)
+
+ def dump_enum(self, ln, proto):
+ """
+ Stores an enum inside self.entries array.
+ """
+ #
+ # Strip preprocessor directives. Note that this depends on the
+ # trailing semicolon we added in process_proto_type().
+ #
+ proto = KernRe(r'#\s*((define|ifdef|if)\s+|endif)[^;]*;', flags=re.S).sub('', proto)
+ #
+ # Parse out the name and members of the enum. Typedef form first.
+ #
+ r = KernRe(r'typedef\s+enum\s*\{(.*)\}\s*(\w*)\s*;')
+ if r.search(proto):
+ declaration_name = r.group(2)
+ members = trim_private_members(r.group(1))
+ #
+ # Failing that, look for a straight enum
+ #
+ else:
+ r = KernRe(r'enum\s+(\w*)\s*\{(.*)\}')
+ if r.match(proto):
+ declaration_name = r.group(1)
+ members = trim_private_members(r.group(2))
+ #
+ # OK, this isn't going to work.
+ #
+ else:
+ self.emit_msg(ln, f"{proto}: error: Cannot parse enum!")
+ return
+ #
+ # Make sure we found what we were expecting.
+ #
+ if self.entry.identifier != declaration_name:
+ if self.entry.identifier == "":
+ self.emit_msg(ln,
+ f"{proto}: wrong kernel-doc identifier on prototype")
+ else:
+ self.emit_msg(ln,
+ f"expecting prototype for enum {self.entry.identifier}. "
+ f"Prototype was for enum {declaration_name} instead")
+ return
+
+ if not declaration_name:
+ declaration_name = "(anonymous)"
+ #
+ # Parse out the name of each enum member, and verify that we
+ # have a description for it.
+ #
+ member_set = set()
+ members = KernRe(r'\([^;)]*\)').sub('', members)
+ for arg in members.split(','):
+ if not arg:
+ continue
+ arg = KernRe(r'^\s*(\w+).*').sub(r'\1', arg)
+ self.entry.parameterlist.append(arg)
+ if arg not in self.entry.parameterdescs:
+ self.entry.parameterdescs[arg] = self.undescribed
+ self.emit_msg(ln,
+ f"Enum value '{arg}' not described in enum '{declaration_name}'")
+ member_set.add(arg)
+ #
+ # Ensure that every described member actually exists in the enum.
+ #
+ for k in self.entry.parameterdescs:
+ if k not in member_set:
+ self.emit_msg(ln,
+ f"Excess enum value '@{k}' description in '{declaration_name}'")
+
+ self.output_declaration('enum', declaration_name,
+ purpose=self.entry.declaration_purpose)
+
+ def dump_declaration(self, ln, prototype):
+ """
+ Stores a data declaration inside self.entries array.
+ """
+
+ if self.entry.decl_type == "enum":
+ self.dump_enum(ln, prototype)
+ elif self.entry.decl_type == "typedef":
+ self.dump_typedef(ln, prototype)
+ elif self.entry.decl_type in ["union", "struct"]:
+ self.dump_struct(ln, prototype)
+ else:
+ # This would be a bug
+ self.emit_message(ln, f'Unknown declaration type: {self.entry.decl_type}')
+
+ def dump_function(self, ln, prototype):
+ """
+ Stores a function or function macro inside self.entries array.
+ """
+
+ found = func_macro = False
+ return_type = ''
+ decl_type = 'function'
+ #
+ # Apply the initial transformations.
+ #
+ prototype = apply_transforms(function_xforms, prototype)
+ #
+ # If we have a macro, remove the "#define" at the front.
+ #
+ new_proto = KernRe(r"^#\s*define\s+").sub("", prototype)
+ if new_proto != prototype:
+ prototype = new_proto
+ #
+ # Dispense with the simple "#define A B" case here; the key
+ # is the space after the name of the symbol being defined.
+ # NOTE that the seemingly misnamed "func_macro" indicates a
+ # macro *without* arguments.
+ #
+ r = KernRe(r'^(\w+)\s+')
+ if r.search(prototype):
+ return_type = ''
+ declaration_name = r.group(1)
+ func_macro = True
+ found = True
+
+ # Yes, this truly is vile. We are looking for:
+ # 1. Return type (may be nothing if we're looking at a macro)
+ # 2. Function name
+ # 3. Function parameters.
+ #
+ # All the while we have to watch out for function pointer parameters
+ # (which IIRC is what the two sections are for), C types (these
+ # regexps don't even start to express all the possibilities), and
+ # so on.
+ #
+ # If you mess with these regexps, it's a good idea to check that
+ # the following functions' documentation still comes out right:
+ # - parport_register_device (function pointer parameters)
+ # - atomic_set (macro)
+ # - pci_match_device, __copy_to_user (long return type)
+
+ name = r'\w+'
+ type1 = r'(?:[\w\s]+)?'
+ type2 = r'(?:[\w\s]+\*+)+'
+ #
+ # Attempt to match first on (args) with no internal parentheses; this
+ # lets us easily filter out __acquires() and other post-args stuff. If
+ # that fails, just grab the rest of the line to the last closing
+ # parenthesis.
+ #
+ proto_args = r'\(([^\(]*|.*)\)'
+ #
+ # (Except for the simple macro case) attempt to split up the prototype
+ # in the various ways we understand.
+ #
+ if not found:
+ patterns = [
+ rf'^()({name})\s*{proto_args}',
+ rf'^({type1})\s+({name})\s*{proto_args}',
+ rf'^({type2})\s*({name})\s*{proto_args}',
+ ]
+
+ for p in patterns:
+ r = KernRe(p)
+ if r.match(prototype):
+ return_type = r.group(1)
+ declaration_name = r.group(2)
+ args = r.group(3)
+ self.create_parameter_list(ln, decl_type, args, ',',
+ declaration_name)
+ found = True
+ break
+ #
+ # Parsing done; make sure that things are as we expect.
+ #
+ if not found:
+ self.emit_msg(ln,
+ f"cannot understand function prototype: '{prototype}'")
+ return
+ if self.entry.identifier != declaration_name:
+ self.emit_msg(ln, f"expecting prototype for {self.entry.identifier}(). "
+ f"Prototype was for {declaration_name}() instead")
+ return
+ self.check_sections(ln, declaration_name, "function")
+ self.check_return_section(ln, declaration_name, return_type)
+ #
+ # Store the result.
+ #
+ self.output_declaration(decl_type, declaration_name,
+ typedef=('typedef' in return_type),
+ functiontype=return_type,
+ purpose=self.entry.declaration_purpose,
+ func_macro=func_macro)
+
+
+ def dump_typedef(self, ln, proto):
+ """
+ Stores a typedef inside self.entries array.
+ """
+ #
+ # We start by looking for function typedefs.
+ #
+ typedef_type = r'typedef((?:\s+[\w*]+\b){0,7}\s+(?:\w+\b|\*+))\s*'
+ typedef_ident = r'\*?\s*(\w\S+)\s*'
+ typedef_args = r'\s*\((.*)\);'
+
+ typedef1 = KernRe(typedef_type + r'\(' + typedef_ident + r'\)' + typedef_args)
+ typedef2 = KernRe(typedef_type + typedef_ident + typedef_args)
+
+ # Parse function typedef prototypes
+ for r in [typedef1, typedef2]:
+ if not r.match(proto):
+ continue
+
+ return_type = r.group(1).strip()
+ declaration_name = r.group(2)
+ args = r.group(3)
+
+ if self.entry.identifier != declaration_name:
+ self.emit_msg(ln,
+ f"expecting prototype for typedef {self.entry.identifier}. Prototype was for typedef {declaration_name} instead\n")
+ return
+
+ self.create_parameter_list(ln, 'function', args, ',', declaration_name)
+
+ self.output_declaration('function', declaration_name,
+ typedef=True,
+ functiontype=return_type,
+ purpose=self.entry.declaration_purpose)
+ return
+ #
+ # Not a function, try to parse a simple typedef.
+ #
+ r = KernRe(r'typedef.*\s+(\w+)\s*;')
+ if r.match(proto):
+ declaration_name = r.group(1)
+
+ if self.entry.identifier != declaration_name:
+ self.emit_msg(ln,
+ f"expecting prototype for typedef {self.entry.identifier}. Prototype was for typedef {declaration_name} instead\n")
+ return
+
+ self.output_declaration('typedef', declaration_name,
+ purpose=self.entry.declaration_purpose)
+ return
+
+ self.emit_msg(ln, "error: Cannot parse typedef!")
+
+ @staticmethod
+ def process_export(function_set, line):
+ """
+ process EXPORT_SYMBOL* tags
+
+ This method doesn't use any variable from the class, so declare it
+ with a staticmethod decorator.
+ """
+
+ # We support documenting some exported symbols with different
+ # names. A horrible hack.
+ suffixes = [ '_noprof' ]
+
+ # Note: it accepts only one EXPORT_SYMBOL* per line, as having
+ # multiple export lines would violate Kernel coding style.
+
+ if export_symbol.search(line):
+ symbol = export_symbol.group(2)
+ elif export_symbol_ns.search(line):
+ symbol = export_symbol_ns.group(2)
+ else:
+ return False
+ #
+ # Found an export, trim out any special suffixes
+ #
+ for suffix in suffixes:
+ # Be backward compatible with Python < 3.9
+ if symbol.endswith(suffix):
+ symbol = symbol[:-len(suffix)]
+ function_set.add(symbol)
+ return True
+
+ def process_normal(self, ln, line):
+ """
+ STATE_NORMAL: looking for the /** to begin everything.
+ """
+
+ if not doc_start.match(line):
+ return
+
+ # start a new entry
+ self.reset_state(ln)
+
+ # next line is always the function name
+ self.state = state.NAME
+
+ def process_name(self, ln, line):
+ """
+ STATE_NAME: Looking for the "name - description" line
+ """
+ #
+ # Check for a DOC: block and handle them specially.
+ #
+ if doc_block.search(line):
+
+ if not doc_block.group(1):
+ self.entry.begin_section(ln, "Introduction")
+ else:
+ self.entry.begin_section(ln, doc_block.group(1))
+
+ self.entry.identifier = self.entry.section
+ self.state = state.DOCBLOCK
+ #
+ # Otherwise we're looking for a normal kerneldoc declaration line.
+ #
+ elif doc_decl.search(line):
+ self.entry.identifier = doc_decl.group(1)
+
+ # Test for data declaration
+ if doc_begin_data.search(line):
+ self.entry.decl_type = doc_begin_data.group(1)
+ self.entry.identifier = doc_begin_data.group(2)
+ #
+ # Look for a function description
+ #
+ elif doc_begin_func.search(line):
+ self.entry.identifier = doc_begin_func.group(1)
+ self.entry.decl_type = "function"
+ #
+ # We struck out.
+ #
+ else:
+ self.emit_msg(ln,
+ f"This comment starts with '/**', but isn't a kernel-doc comment. Refer to Documentation/doc-guide/kernel-doc.rst\n{line}")
+ self.state = state.NORMAL
+ return
+ #
+ # OK, set up for a new kerneldoc entry.
+ #
+ self.state = state.BODY
+ self.entry.identifier = self.entry.identifier.strip(" ")
+ # if there's no @param blocks need to set up default section here
+ self.entry.begin_section(ln + 1)
+ #
+ # Find the description portion, which *should* be there but
+ # isn't always.
+ # (We should be able to capture this from the previous parsing - someday)
+ #
+ r = KernRe("[-:](.*)")
+ if r.search(line):
+ self.entry.declaration_purpose = trim_whitespace(r.group(1))
+ self.state = state.DECLARATION
+ else:
+ self.entry.declaration_purpose = ""
+
+ if not self.entry.declaration_purpose and self.config.wshort_desc:
+ self.emit_msg(ln,
+ f"missing initial short description on line:\n{line}")
+
+ if not self.entry.identifier and self.entry.decl_type != "enum":
+ self.emit_msg(ln,
+ f"wrong kernel-doc identifier on line:\n{line}")
+ self.state = state.NORMAL
+
+ if self.config.verbose:
+ self.emit_msg(ln,
+ f"Scanning doc for {self.entry.decl_type} {self.entry.identifier}",
+ warning=False)
+ #
+ # Failed to find an identifier. Emit a warning
+ #
+ else:
+ self.emit_msg(ln, f"Cannot find identifier on line:\n{line}")
+
+ #
+ # Helper function to determine if a new section is being started.
+ #
+ def is_new_section(self, ln, line):
+ if doc_sect.search(line):
+ self.state = state.BODY
+ #
+ # Pick out the name of our new section, tweaking it if need be.
+ #
+ newsection = doc_sect.group(1)
+ if newsection.lower() == 'description':
+ newsection = 'Description'
+ elif newsection.lower() == 'context':
+ newsection = 'Context'
+ self.state = state.SPECIAL_SECTION
+ elif newsection.lower() in ["@return", "@returns",
+ "return", "returns"]:
+ newsection = "Return"
+ self.state = state.SPECIAL_SECTION
+ elif newsection[0] == '@':
+ self.state = state.SPECIAL_SECTION
+ #
+ # Initialize the contents, and get the new section going.
+ #
+ newcontents = doc_sect.group(2)
+ if not newcontents:
+ newcontents = ""
+ self.dump_section()
+ self.entry.begin_section(ln, newsection)
+ self.entry.leading_space = None
+
+ self.entry.add_text(newcontents.lstrip())
+ return True
+ return False
+
+ #
+ # Helper function to detect (and effect) the end of a kerneldoc comment.
+ #
+ def is_comment_end(self, ln, line):
+ if doc_end.search(line):
+ self.dump_section()
+
+ # Look for doc_com + <text> + doc_end:
+ r = KernRe(r'\s*\*\s*[a-zA-Z_0-9:.]+\*/')
+ if r.match(line):
+ self.emit_msg(ln, f"suspicious ending line: {line}")
+
+ self.entry.prototype = ""
+ self.entry.new_start_line = ln + 1
+
+ self.state = state.PROTO
+ return True
+ return False
+
+
+ def process_decl(self, ln, line):
+ """
+ STATE_DECLARATION: We've seen the beginning of a declaration
+ """
+ if self.is_new_section(ln, line) or self.is_comment_end(ln, line):
+ return
+ #
+ # Look for anything with the " * " line beginning.
+ #
+ if doc_content.search(line):
+ cont = doc_content.group(1)
+ #
+ # A blank line means that we have moved out of the declaration
+ # part of the comment (without any "special section" parameter
+ # descriptions).
+ #
+ if cont == "":
+ self.state = state.BODY
+ #
+ # Otherwise we have more of the declaration section to soak up.
+ #
+ else:
+ self.entry.declaration_purpose = \
+ trim_whitespace(self.entry.declaration_purpose + ' ' + cont)
+ else:
+ # Unknown line, ignore
+ self.emit_msg(ln, f"bad line: {line}")
+
+
+ def process_special(self, ln, line):
+ """
+ STATE_SPECIAL_SECTION: a section ending with a blank line
+ """
+ #
+ # If we have hit a blank line (only the " * " marker), then this
+ # section is done.
+ #
+ if KernRe(r"\s*\*\s*$").match(line):
+ self.entry.begin_section(ln, dump = True)
+ self.state = state.BODY
+ return
+ #
+ # Not a blank line, look for the other ways to end the section.
+ #
+ if self.is_new_section(ln, line) or self.is_comment_end(ln, line):
+ return
+ #
+ # OK, we should have a continuation of the text for this section.
+ #
+ if doc_content.search(line):
+ cont = doc_content.group(1)
+ #
+ # If the lines of text after the first in a special section have
+ # leading white space, we need to trim it out or Sphinx will get
+ # confused. For the second line (the None case), see what we
+ # find there and remember it.
+ #
+ if self.entry.leading_space is None:
+ r = KernRe(r'^(\s+)')
+ if r.match(cont):
+ self.entry.leading_space = len(r.group(1))
+ else:
+ self.entry.leading_space = 0
+ #
+ # Otherwise, before trimming any leading chars, be *sure*
+ # that they are white space. We should maybe warn if this
+ # isn't the case.
+ #
+ for i in range(0, self.entry.leading_space):
+ if cont[i] != " ":
+ self.entry.leading_space = i
+ break
+ #
+ # Add the trimmed result to the section and we're done.
+ #
+ self.entry.add_text(cont[self.entry.leading_space:])
+ else:
+ # Unknown line, ignore
+ self.emit_msg(ln, f"bad line: {line}")
+
+ def process_body(self, ln, line):
+ """
+ STATE_BODY: the bulk of a kerneldoc comment.
+ """
+ if self.is_new_section(ln, line) or self.is_comment_end(ln, line):
+ return
+
+ if doc_content.search(line):
+ cont = doc_content.group(1)
+ self.entry.add_text(cont)
+ else:
+ # Unknown line, ignore
+ self.emit_msg(ln, f"bad line: {line}")
+
+ def process_inline_name(self, ln, line):
+ """STATE_INLINE_NAME: beginning of docbook comments within a prototype."""
+
+ if doc_inline_sect.search(line):
+ self.entry.begin_section(ln, doc_inline_sect.group(1))
+ self.entry.add_text(doc_inline_sect.group(2).lstrip())
+ self.state = state.INLINE_TEXT
+ elif doc_inline_end.search(line):
+ self.dump_section()
+ self.state = state.PROTO
+ elif doc_content.search(line):
+ self.emit_msg(ln, f"Incorrect use of kernel-doc format: {line}")
+ self.state = state.PROTO
+ # else ... ??
+
+ def process_inline_text(self, ln, line):
+ """STATE_INLINE_TEXT: docbook comments within a prototype."""
+
+ if doc_inline_end.search(line):
+ self.dump_section()
+ self.state = state.PROTO
+ elif doc_content.search(line):
+ self.entry.add_text(doc_content.group(1))
+ # else ... ??
+
+ def syscall_munge(self, ln, proto): # pylint: disable=W0613
+ """
+ Handle syscall definitions
+ """
+
+ is_void = False
+
+ # Strip newlines/CR's
+ proto = re.sub(r'[\r\n]+', ' ', proto)
+
+ # Check if it's a SYSCALL_DEFINE0
+ if 'SYSCALL_DEFINE0' in proto:
+ is_void = True
+
+ # Replace SYSCALL_DEFINE with correct return type & function name
+ proto = KernRe(r'SYSCALL_DEFINE.*\(').sub('long sys_', proto)
+
+ r = KernRe(r'long\s+(sys_.*?),')
+ if r.search(proto):
+ proto = KernRe(',').sub('(', proto, count=1)
+ elif is_void:
+ proto = KernRe(r'\)').sub('(void)', proto, count=1)
+
+ # Now delete all of the odd-numbered commas in the proto
+ # so that argument types & names don't have a comma between them
+ count = 0
+ length = len(proto)
+
+ if is_void:
+ length = 0 # skip the loop if is_void
+
+ for ix in range(length):
+ if proto[ix] == ',':
+ count += 1
+ if count % 2 == 1:
+ proto = proto[:ix] + ' ' + proto[ix + 1:]
+
+ return proto
+
+ def tracepoint_munge(self, ln, proto):
+ """
+ Handle tracepoint definitions
+ """
+
+ tracepointname = None
+ tracepointargs = None
+
+ # Match tracepoint name based on different patterns
+ r = KernRe(r'TRACE_EVENT\((.*?),')
+ if r.search(proto):
+ tracepointname = r.group(1)
+
+ r = KernRe(r'DEFINE_SINGLE_EVENT\((.*?),')
+ if r.search(proto):
+ tracepointname = r.group(1)
+
+ r = KernRe(r'DEFINE_EVENT\((.*?),(.*?),')
+ if r.search(proto):
+ tracepointname = r.group(2)
+
+ if tracepointname:
+ tracepointname = tracepointname.lstrip()
+
+ r = KernRe(r'TP_PROTO\((.*?)\)')
+ if r.search(proto):
+ tracepointargs = r.group(1)
+
+ if not tracepointname or not tracepointargs:
+ self.emit_msg(ln,
+ f"Unrecognized tracepoint format:\n{proto}\n")
+ else:
+ proto = f"static inline void trace_{tracepointname}({tracepointargs})"
+ self.entry.identifier = f"trace_{self.entry.identifier}"
+
+ return proto
+
+ def process_proto_function(self, ln, line):
+ """Ancillary routine to process a function prototype"""
+
+ # strip C99-style comments to end of line
+ line = KernRe(r"//.*$", re.S).sub('', line)
+ #
+ # Soak up the line's worth of prototype text, stopping at { or ; if present.
+ #
+ if KernRe(r'\s*#\s*define').match(line):
+ self.entry.prototype = line
+ elif not line.startswith('#'): # skip other preprocessor stuff
+ r = KernRe(r'([^\{]*)')
+ if r.match(line):
+ self.entry.prototype += r.group(1) + " "
+ #
+ # If we now have the whole prototype, clean it up and declare victory.
+ #
+ if '{' in line or ';' in line or KernRe(r'\s*#\s*define').match(line):
+ # strip comments and surrounding spaces
+ self.entry.prototype = KernRe(r'/\*.*\*/').sub('', self.entry.prototype).strip()
+ #
+ # Handle self.entry.prototypes for function pointers like:
+ # int (*pcs_config)(struct foo)
+ # by turning it into
+ # int pcs_config(struct foo)
+ #
+ r = KernRe(r'^(\S+\s+)\(\s*\*(\S+)\)')
+ self.entry.prototype = r.sub(r'\1\2', self.entry.prototype)
+ #
+ # Handle special declaration syntaxes
+ #
+ if 'SYSCALL_DEFINE' in self.entry.prototype:
+ self.entry.prototype = self.syscall_munge(ln,
+ self.entry.prototype)
+ else:
+ r = KernRe(r'TRACE_EVENT|DEFINE_EVENT|DEFINE_SINGLE_EVENT')
+ if r.search(self.entry.prototype):
+ self.entry.prototype = self.tracepoint_munge(ln,
+ self.entry.prototype)
+ #
+ # ... and we're done
+ #
+ self.dump_function(ln, self.entry.prototype)
+ self.reset_state(ln)
+
+ def process_proto_type(self, ln, line):
+ """Ancillary routine to process a type"""
+
+ # Strip C99-style comments and surrounding whitespace
+ line = KernRe(r"//.*$", re.S).sub('', line).strip()
+ if not line:
+ return # nothing to see here
+
+ # To distinguish preprocessor directive from regular declaration later.
+ if line.startswith('#'):
+ line += ";"
+ #
+ # Split the declaration on any of { } or ;, and accumulate pieces
+ # until we hit a semicolon while not inside {brackets}
+ #
+ r = KernRe(r'(.*?)([{};])')
+ for chunk in r.split(line):
+ if chunk: # Ignore empty matches
+ self.entry.prototype += chunk
+ #
+ # This cries out for a match statement ... someday after we can
+ # drop Python 3.9 ...
+ #
+ if chunk == '{':
+ self.entry.brcount += 1
+ elif chunk == '}':
+ self.entry.brcount -= 1
+ elif chunk == ';' and self.entry.brcount <= 0:
+ self.dump_declaration(ln, self.entry.prototype)
+ self.reset_state(ln)
+ return
+ #
+ # We hit the end of the line while still in the declaration; put
+ # in a space to represent the newline.
+ #
+ self.entry.prototype += ' '
+
+ def process_proto(self, ln, line):
+ """STATE_PROTO: reading a function/whatever prototype."""
+
+ if doc_inline_oneline.search(line):
+ self.entry.begin_section(ln, doc_inline_oneline.group(1))
+ self.entry.add_text(doc_inline_oneline.group(2))
+ self.dump_section()
+
+ elif doc_inline_start.search(line):
+ self.state = state.INLINE_NAME
+
+ elif self.entry.decl_type == 'function':
+ self.process_proto_function(ln, line)
+
+ else:
+ self.process_proto_type(ln, line)
+
+ def process_docblock(self, ln, line):
+ """STATE_DOCBLOCK: within a DOC: block."""
+
+ if doc_end.search(line):
+ self.dump_section()
+ self.output_declaration("doc", self.entry.identifier)
+ self.reset_state(ln)
+
+ elif doc_content.search(line):
+ self.entry.add_text(doc_content.group(1))
+
+ def parse_export(self):
+ """
+ Parses EXPORT_SYMBOL* macros from a single Kernel source file.
+ """
+
+ export_table = set()
+
+ try:
+ with open(self.fname, "r", encoding="utf8",
+ errors="backslashreplace") as fp:
+
+ for line in fp:
+ self.process_export(export_table, line)
+
+ except IOError:
+ return None
+
+ return export_table
+
+ #
+ # The state/action table telling us which function to invoke in
+ # each state.
+ #
+ state_actions = {
+ state.NORMAL: process_normal,
+ state.NAME: process_name,
+ state.BODY: process_body,
+ state.DECLARATION: process_decl,
+ state.SPECIAL_SECTION: process_special,
+ state.INLINE_NAME: process_inline_name,
+ state.INLINE_TEXT: process_inline_text,
+ state.PROTO: process_proto,
+ state.DOCBLOCK: process_docblock,
+ }
+
+ def parse_kdoc(self):
+ """
+ Open and process each line of a C source file.
+ The parsing is controlled via a state machine, and the line is passed
+ to a different process function depending on the state. The process
+ function may update the state as needed.
+
+ Besides parsing kernel-doc tags, it also parses export symbols.
+ """
+
+ prev = ""
+ prev_ln = None
+ export_table = set()
+
+ try:
+ with open(self.fname, "r", encoding="utf8",
+ errors="backslashreplace") as fp:
+ for ln, line in enumerate(fp):
+
+ line = line.expandtabs().strip("\n")
+
+ # Group continuation lines on prototypes
+ if self.state == state.PROTO:
+ if line.endswith("\\"):
+ prev += line.rstrip("\\")
+ if not prev_ln:
+ prev_ln = ln
+ continue
+
+ if prev:
+ ln = prev_ln
+ line = prev + line
+ prev = ""
+ prev_ln = None
+
+ self.config.log.debug("%d %s: %s",
+ ln, state.name[self.state],
+ line)
+
+ # This is an optimization over the original script.
+ # There, when export_file was used for the same file,
+ # it was read twice. Here, we use the already-existing
+ # loop to parse exported symbols as well.
+ #
+ if (self.state != state.NORMAL) or \
+ not self.process_export(export_table, line):
+ # Hand this line to the appropriate state handler
+ self.state_actions[self.state](self, ln, line)
+
+ except OSError:
+ self.config.log.error(f"Error: Cannot open file {self.fname}")
+
+ return export_table, self.entries
diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
new file mode 100644
index 000000000000..2dfa1bf83d64
--- /dev/null
+++ b/tools/lib/python/kdoc/kdoc_re.py
@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+
+"""
+Regular expression ancillary classes.
+
+Those help caching regular expressions and do matching for kernel-doc.
+"""
+
+import re
+
+# Local cache for regular expressions
+re_cache = {}
+
+
+class KernRe:
+ """
+ Helper class to simplify regex declaration and usage.
+
+ It calls re.compile for a given pattern. It also allows adding
+ regular expressions and define sub at class init time.
+
+ Regular expressions can be cached via an argument, helping to speedup
+ searches.
+ """
+
+ def _add_regex(self, string, flags):
+ """
+ Adds a new regex or reuses it from the cache.
+ """
+ self.regex = re_cache.get(string, None)
+ if not self.regex:
+ self.regex = re.compile(string, flags=flags)
+ if self.cache:
+ re_cache[string] = self.regex
+
+ def __init__(self, string, cache=True, flags=0):
+ """
+ Compile a regular expression and initialize internal vars.
+ """
+
+ self.cache = cache
+ self.last_match = None
+
+ self._add_regex(string, flags)
+
+ def __str__(self):
+ """
+ Return the regular expression pattern.
+ """
+ return self.regex.pattern
+
+ def __add__(self, other):
+ """
+ Allows adding two regular expressions into one.
+ """
+
+ return KernRe(str(self) + str(other), cache=self.cache or other.cache,
+ flags=self.regex.flags | other.regex.flags)
+
+ def match(self, string):
+ """
+ Handles a re.match storing its results
+ """
+
+ self.last_match = self.regex.match(string)
+ return self.last_match
+
+ def search(self, string):
+ """
+ Handles a re.search storing its results
+ """
+
+ self.last_match = self.regex.search(string)
+ return self.last_match
+
+ def findall(self, string):
+ """
+ Alias to re.findall
+ """
+
+ return self.regex.findall(string)
+
+ def split(self, string):
+ """
+ Alias to re.split
+ """
+
+ return self.regex.split(string)
+
+ def sub(self, sub, string, count=0):
+ """
+ Alias to re.sub
+ """
+
+ return self.regex.sub(sub, string, count=count)
+
+ def group(self, num):
+ """
+ Returns the group results of the last match
+ """
+
+ return self.last_match.group(num)
+
+
+class NestedMatch:
+ """
+ Finding nested delimiters is hard with regular expressions. It is
+ even harder on Python with its normal re module, as there are several
+ advanced regular expressions that are missing.
+
+ This is the case of this pattern:
+
+ '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'
+
+ which is used to properly match open/close parentheses of the
+ string search STRUCT_GROUP(),
+
+ Add a class that counts pairs of delimiters, using it to match and
+ replace nested expressions.
+
+ The original approach was suggested by:
+ https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
+
+ Although I re-implemented it to make it more generic and match 3 types
+ of delimiters. The logic checks if delimiters are paired. If not, it
+ will ignore the search string.
+ """
+
+ # TODO: make NestedMatch handle multiple match groups
+ #
+ # Right now, regular expressions to match it are defined only up to
+ # the start delimiter, e.g.:
+ #
+ # \bSTRUCT_GROUP\(
+ #
+ # is similar to: STRUCT_GROUP\((.*)\)
+ # except that the content inside the match group is delimiter-aligned.
+ #
+ # The content inside parentheses is converted into a single replace
+ # group (e.g. r`\1').
+ #
+ # It would be nice to change such definition to support multiple
+ # match groups, allowing a regex equivalent to:
+ #
+ # FOO\((.*), (.*), (.*)\)
+ #
+ # it is probably easier to define it not as a regular expression, but
+ # with some lexical definition like:
+ #
+ # FOO(arg1, arg2, arg3)
+
+ DELIMITER_PAIRS = {
+ '{': '}',
+ '(': ')',
+ '[': ']',
+ }
+
+ RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')
+
+ def _search(self, regex, line):
+ """
+ Finds paired blocks for a regex that ends with a delimiter.
+
+ The suggestion of using finditer to match pairs came from:
+ https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
+ but I ended using a different implementation to align all three types
+ of delimiters and seek for an initial regular expression.
+
+ The algorithm seeks for open/close paired delimiters and places them
+ into a stack, yielding a start/stop position of each match when the
+ stack is zeroed.
+
+ The algorithm should work fine for properly paired lines, but will
+ silently ignore end delimiters that precede a start delimiter.
+ This should be OK for kernel-doc parser, as unaligned delimiters
+ would cause compilation errors. So, we don't need to raise exceptions
+ to cover such issues.
+ """
+
+ stack = []
+
+ for match_re in regex.finditer(line):
+ start = match_re.start()
+ offset = match_re.end()
+
+ d = line[offset - 1]
+ if d not in self.DELIMITER_PAIRS:
+ continue
+
+ end = self.DELIMITER_PAIRS[d]
+ stack.append(end)
+
+ for match in self.RE_DELIM.finditer(line[offset:]):
+ pos = match.start() + offset
+
+ d = line[pos]
+
+ if d in self.DELIMITER_PAIRS:
+ end = self.DELIMITER_PAIRS[d]
+
+ stack.append(end)
+ continue
+
+ # Does the end delimiter match what is expected?
+ if stack and d == stack[-1]:
+ stack.pop()
+
+ if not stack:
+ yield start, offset, pos + 1
+ break
+
+ def search(self, regex, line):
+ """
+ This is similar to re.search:
+
+ It matches a regex that it is followed by a delimiter,
+ returning occurrences only if all delimiters are paired.
+ """
+
+ for t in self._search(regex, line):
+
+ yield line[t[0]:t[2]]
+
+ def sub(self, regex, sub, line, count=0):
+ """
+ This is similar to re.sub:
+
+ It matches a regex that it is followed by a delimiter,
+ replacing occurrences only if all delimiters are paired.
+
+ if r'\1' is used, it works just like re: it places there the
+ matched paired data with the delimiter stripped.
+
+ If count is different than zero, it will replace at most count
+ items.
+ """
+ out = ""
+
+ cur_pos = 0
+ n = 0
+
+ for start, end, pos in self._search(regex, line):
+ out += line[cur_pos:start]
+
+ # Value, ignoring start/end delimiters
+ value = line[end:pos - 1]
+
+ # replaces \1 at the sub string, if \1 is used there
+ new_sub = sub
+ new_sub = new_sub.replace(r'\1', value)
+
+ out += new_sub
+
+ # Drop end ';' if any
+ if line[pos] == ';':
+ pos += 1
+
+ cur_pos = pos
+ n += 1
+
+ if count and count >= n:
+ break
+
+ # Append the remaining string
+ l = len(line)
+ out += line[cur_pos:l]
+
+ return out
diff --git a/tools/lib/python/kdoc/latex_fonts.py b/tools/lib/python/kdoc/latex_fonts.py
new file mode 100755
index 000000000000..29317f8006ea
--- /dev/null
+++ b/tools/lib/python/kdoc/latex_fonts.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) Akira Yokosawa, 2024
+#
+# Ported to Python by (c) Mauro Carvalho Chehab, 2025
+
+"""
+Detect problematic Noto CJK variable fonts.
+
+For "make pdfdocs", reports of build errors of translations.pdf started
+arriving early 2024 [1, 2]. It turned out that Fedora and openSUSE
+tumbleweed have started deploying variable-font [3] format of "Noto CJK"
+fonts [4, 5]. For PDF, a LaTeX package named xeCJK is used for CJK
+(Chinese, Japanese, Korean) pages. xeCJK requires XeLaTeX/XeTeX, which
+does not (and likely never will) understand variable fonts for historical
+reasons.
+
+The build error happens even when both of variable- and non-variable-format
+fonts are found on the build system. To make matters worse, Fedora enlists
+variable "Noto CJK" fonts in the requirements of langpacks-ja, -ko, -zh_CN,
+-zh_TW, etc. Hence developers who have interest in CJK pages are more
+likely to encounter the build errors.
+
+This script is invoked from the error path of "make pdfdocs" and emits
+suggestions if variable-font files of "Noto CJK" fonts are in the list of
+fonts accessible from XeTeX.
+
+References:
+[1]: https://lore.kernel.org/r/8734tqsrt7.fsf@meer.lwn.net/
+[2]: https://lore.kernel.org/r/1708585803.600323099@f111.i.mail.ru/
+[3]: https://en.wikipedia.org/wiki/Variable_font
+[4]: https://fedoraproject.org/wiki/Changes/Noto_CJK_Variable_Fonts
+[5]: https://build.opensuse.org/request/show/1157217
+
+#===========================================================================
+Workarounds for building translations.pdf
+#===========================================================================
+
+* Denylist "variable font" Noto CJK fonts.
+ - Create $HOME/deny-vf/fontconfig/fonts.conf from template below, with
+ tweaks if necessary. Remove leading "".
+ - Path of fontconfig/fonts.conf can be overridden by setting an env
+ variable FONTS_CONF_DENY_VF.
+
+ * Template:
+-----------------------------------------------------------------
+<?xml version="1.0"?>
+<!DOCTYPE fontconfig SYSTEM "urn:fontconfig:fonts.dtd">
+<fontconfig>
+<!--
+ Ignore variable-font glob (not to break xetex)
+-->
+ <selectfont>
+ <rejectfont>
+ <!--
+ for Fedora
+ -->
+ <glob>/usr/share/fonts/google-noto-*-cjk-vf-fonts</glob>
+ <!--
+ for openSUSE tumbleweed
+ -->
+ <glob>/usr/share/fonts/truetype/Noto*CJK*-VF.otf</glob>
+ </rejectfont>
+ </selectfont>
+</fontconfig>
+-----------------------------------------------------------------
+
+ The denylisting is activated for "make pdfdocs".
+
+* For skipping CJK pages in PDF
+ - Uninstall texlive-xecjk.
+ Denylisting is not needed in this case.
+
+* For printing CJK pages in PDF
+ - Need non-variable "Noto CJK" fonts.
+ * Fedora
+ - google-noto-sans-cjk-fonts
+ - google-noto-serif-cjk-fonts
+ * openSUSE tumbleweed
+ - Non-variable "Noto CJK" fonts are not available as distro packages
+ as of April, 2024. Fetch a set of font files from upstream Noto
+ CJK Font released at:
+ https://github.com/notofonts/noto-cjk/tree/main/Sans#super-otc
+ and at:
+ https://github.com/notofonts/noto-cjk/tree/main/Serif#super-otc
+ , then uncompress and deploy them.
+ - Remember to update fontconfig cache by running fc-cache.
+
+!!! Caution !!!
+ Uninstalling "variable font" packages can be dangerous.
+ They might be depended upon by other packages important for your work.
+ Denylisting should be less invasive, as it is effective only while
+ XeLaTeX runs in "make pdfdocs".
+"""
+
+import os
+import re
+import subprocess
+import textwrap
+import sys
+
+class LatexFontChecker:
+ """
+ Detect problems with CJK variable fonts that affect PDF builds for
+ translations.
+ """
+
+ def __init__(self, deny_vf=None):
+ if not deny_vf:
+ deny_vf = os.environ.get('FONTS_CONF_DENY_VF', "~/deny-vf")
+
+ self.environ = os.environ.copy()
+ self.environ['XDG_CONFIG_HOME'] = os.path.expanduser(deny_vf)
+
+ self.re_cjk = re.compile(r"([^:]+):\s*Noto\s+(Sans|Sans Mono|Serif) CJK")
+
+ def description(self):
+ return __doc__
+
+ def get_noto_cjk_vf_fonts(self):
+ """Get Noto CJK fonts"""
+
+ cjk_fonts = set()
+ cmd = ["fc-list", ":", "file", "family", "variable"]
+ try:
+ result = subprocess.run(cmd,stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ universal_newlines=True,
+ env=self.environ,
+ check=True)
+
+ except subprocess.CalledProcessError as exc:
+ sys.exit(f"Error running fc-list: {repr(exc)}")
+
+ for line in result.stdout.splitlines():
+ if 'variable=True' not in line:
+ continue
+
+ match = self.re_cjk.search(line)
+ if match:
+ cjk_fonts.add(match.group(1))
+
+ return sorted(cjk_fonts)
+
+ def check(self):
+ """Check for problems with CJK fonts"""
+
+ fonts = textwrap.indent("\n".join(self.get_noto_cjk_vf_fonts()), " ")
+ if not fonts:
+ return None
+
+ rel_file = os.path.relpath(__file__, os.getcwd())
+
+ msg = "=" * 77 + "\n"
+ msg += 'XeTeX is confused by "variable font" files listed below:\n'
+ msg += fonts + "\n"
+ msg += textwrap.dedent(f"""
+ For CJK pages in PDF, they need to be hidden from XeTeX by denylisting.
+ Or, CJK pages can be skipped by uninstalling texlive-xecjk.
+
+ For more info on denylisting, other options, and variable font, run:
+
+ tools/docs/check-variable-fonts.py -h
+ """)
+ msg += "=" * 77
+
+ return msg
diff --git a/tools/docs/lib/parse_data_structs.py b/tools/lib/python/kdoc/parse_data_structs.py
index a5aa2e182052..25361996cd20 100755
--- a/tools/docs/lib/parse_data_structs.py
+++ b/tools/lib/python/kdoc/parse_data_structs.py
@@ -53,11 +53,19 @@ class ParseDataStructs:
replace <type> <old_symbol> <new_reference>
- Replaces how old_symbol with a new reference. The new_reference can be:
+ Replaces how old_symbol with a new reference. The new_reference can be:
+
- A simple symbol name;
- A full Sphinx reference.
- On both cases, <type> can be:
+ 3. Namespace rules
+
+ namespace <namespace>
+
+ Sets C namespace to be used during cross-reference generation. Can
+ be overridden by replace rules.
+
+ On ignore and replace rules, <type> can be:
- ioctl: for defines that end with _IO*, e.g. ioctl definitions
- define: for other defines
- symbol: for symbols defined within enums;
@@ -71,6 +79,8 @@ class ParseDataStructs:
ignore ioctl VIDIOC_ENUM_FMT
replace ioctl VIDIOC_DQBUF vidioc_qbuf
replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
+
+ namespace MC
"""
# Parser regexes with multiple ways to capture enums and structs
@@ -140,10 +150,96 @@ class ParseDataStructs:
self.symbols = {}
+ self.namespace = None
+ self.ignore = []
+ self.replace = []
+
for symbol_type in self.DEF_SYMBOL_TYPES:
self.symbols[symbol_type] = {}
- def store_type(self, symbol_type: str, symbol: str,
+ def read_exceptions(self, fname: str):
+ if not fname:
+ return
+
+ name = os.path.basename(fname)
+
+ with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
+ for ln, line in enumerate(f):
+ ln += 1
+ line = line.strip()
+ if not line or line.startswith("#"):
+ continue
+
+ # ignore rules
+ match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
+
+ if match:
+ self.ignore.append((ln, match.group(1), match.group(2)))
+ continue
+
+ # replace rules
+ match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
+ if match:
+ self.replace.append((ln, match.group(1), match.group(2),
+ match.group(3)))
+ continue
+
+ match = re.match(r"^namespace\s+(\S+)", line)
+ if match:
+ self.namespace = match.group(1)
+ continue
+
+ sys.exit(f"{name}:{ln}: invalid line: {line}")
+
+ def apply_exceptions(self):
+ """
+ Process exceptions file with rules to ignore or replace references.
+ """
+
+ # Handle ignore rules
+ for ln, c_type, symbol in self.ignore:
+ if c_type not in self.DEF_SYMBOL_TYPES:
+ sys.exit(f"{name}:{ln}: {c_type} is invalid")
+
+ d = self.symbols[c_type]
+ if symbol in d:
+ del d[symbol]
+
+ # Handle replace rules
+ for ln, c_type, old, new in self.replace:
+ if c_type not in self.DEF_SYMBOL_TYPES:
+ sys.exit(f"{name}:{ln}: {c_type} is invalid")
+
+ reftype = None
+
+ # Parse reference type when the type is specified
+
+ match = re.match(r"^\:c\:(\w+)\:\`(.+)\`", new)
+ if match:
+ reftype = f":c:{match.group(1)}"
+ new = match.group(2)
+ else:
+ match = re.search(r"(\:ref)\:\`(.+)\`", new)
+ if match:
+ reftype = match.group(1)
+ new = match.group(2)
+
+ # If the replacement rule doesn't have a type, get default
+ if not reftype:
+ reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
+ if not reftype:
+ reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")
+
+ new_ref = f"{reftype}:`{old} <{new}>`"
+
+ # Change self.symbols to use the replacement rule
+ if old in self.symbols[c_type]:
+ (_, ln) = self.symbols[c_type][old]
+ self.symbols[c_type][old] = (new_ref, ln)
+ else:
+ print(f"{name}:{ln}: Warning: can't find {old} {c_type}")
+
+ def store_type(self, ln, symbol_type: str, symbol: str,
ref_name: str = None, replace_underscores: bool = True):
"""
Stores a new symbol at self.symbols under symbol_type.
@@ -157,35 +253,42 @@ class ParseDataStructs:
ref_type = defs.get("ref_type")
# Determine ref_link based on symbol type
- if ref_type:
- if symbol_type == "enum":
- ref_link = f"{ref_type}:`{symbol}`"
- else:
- if not ref_name:
- ref_name = symbol.lower()
+ if ref_type or self.namespace:
+ if not ref_name:
+ ref_name = symbol.lower()
+
+ # c-type references don't support hash
+ if ref_type == ":ref" and replace_underscores:
+ ref_name = ref_name.replace("_", "-")
- # c-type references don't support hash
- if ref_type == ":ref" and replace_underscores:
- ref_name = ref_name.replace("_", "-")
+ # C domain references may have namespaces
+ if ref_type.startswith(":c:"):
+ if self.namespace:
+ ref_name = f"{self.namespace}.{ref_name}"
+ if ref_type:
ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
+ else:
+ ref_link = f"`{symbol} <{ref_name}>`"
else:
ref_link = symbol
- self.symbols[symbol_type][symbol] = f"{prefix}{ref_link}{suffix}"
+ self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln)
def store_line(self, line):
"""Stores a line at self.data, properly indented"""
line = " " + line.expandtabs()
self.data += line.rstrip(" ")
- def parse_file(self, file_in: str):
+ def parse_file(self, file_in: str, exceptions: str = None):
"""Reads a C source file and get identifiers"""
self.data = ""
is_enum = False
is_comment = False
multiline = ""
+ self.read_exceptions(exceptions)
+
with open(file_in, "r",
encoding="utf-8", errors="backslashreplace") as f:
for line_no, line in enumerate(f):
@@ -240,20 +343,20 @@ class ParseDataStructs:
if is_enum:
match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
if match:
- self.store_type("symbol", match.group(1))
+ self.store_type(line_no, "symbol", match.group(1))
if "}" in line:
is_enum = False
continue
match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
if match:
- self.store_type("ioctl", match.group(1),
+ self.store_type(line_no, "ioctl", match.group(1),
replace_underscores=False)
continue
match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
if match:
- self.store_type("define", match.group(1))
+ self.store_type(line_no, "define", match.group(1))
continue
match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
@@ -261,90 +364,23 @@ class ParseDataStructs:
if match:
name = match.group(2).strip()
symbol = match.group(3)
- self.store_type("typedef", symbol, ref_name=name)
+ self.store_type(line_no, "typedef", symbol, ref_name=name)
continue
for re_enum in self.RE_ENUMS:
match = re_enum.match(line)
if match:
- self.store_type("enum", match.group(1))
+ self.store_type(line_no, "enum", match.group(1))
is_enum = True
break
for re_struct in self.RE_STRUCTS:
match = re_struct.match(line)
if match:
- self.store_type("struct", match.group(1))
+ self.store_type(line_no, "struct", match.group(1))
break
- def process_exceptions(self, fname: str):
- """
- Process exceptions file with rules to ignore or replace references.
- """
- if not fname:
- return
-
- name = os.path.basename(fname)
-
- with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
- for ln, line in enumerate(f):
- ln += 1
- line = line.strip()
- if not line or line.startswith("#"):
- continue
-
- # Handle ignore rules
- match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
- if match:
- c_type = match.group(1)
- symbol = match.group(2)
-
- if c_type not in self.DEF_SYMBOL_TYPES:
- sys.exit(f"{name}:{ln}: {c_type} is invalid")
-
- d = self.symbols[c_type]
- if symbol in d:
- del d[symbol]
-
- continue
-
- # Handle replace rules
- match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
- if not match:
- sys.exit(f"{name}:{ln}: invalid line: {line}")
-
- c_type, old, new = match.groups()
-
- if c_type not in self.DEF_SYMBOL_TYPES:
- sys.exit(f"{name}:{ln}: {c_type} is invalid")
-
- reftype = None
-
- # Parse reference type when the type is specified
-
- match = re.match(r"^\:c\:(data|func|macro|type)\:\`(.+)\`", new)
- if match:
- reftype = f":c:{match.group(1)}"
- new = match.group(2)
- else:
- match = re.search(r"(\:ref)\:\`(.+)\`", new)
- if match:
- reftype = match.group(1)
- new = match.group(2)
-
- # If the replacement rule doesn't have a type, get default
- if not reftype:
- reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
- if not reftype:
- reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")
-
- new_ref = f"{reftype}:`{old} <{new}>`"
-
- # Change self.symbols to use the replacement rule
- if old in self.symbols[c_type]:
- self.symbols[c_type][old] = new_ref
- else:
- print(f"{name}:{ln}: Warning: can't find {old} {c_type}")
+ self.apply_exceptions()
def debug_print(self):
"""
@@ -360,8 +396,8 @@ class ParseDataStructs:
print(f"{c_type}:")
- for symbol, ref in sorted(refs.items()):
- print(f" {symbol} -> {ref}")
+ for symbol, (ref, ln) in sorted(refs.items()):
+ print(f" #{ln:<5d} {symbol} -> {ref}")
print()
@@ -384,7 +420,7 @@ class ParseDataStructs:
# Process all reference types
for ref_dict in self.symbols.values():
- for symbol, replacement in ref_dict.items():
+ for symbol, (replacement, _) in ref_dict.items():
symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))
text = re.sub(fr'{start_delim}{symbol}{end_delim}',
fr'\1{replacement}\2', text)
@@ -397,16 +433,10 @@ class ParseDataStructs:
def gen_toc(self):
"""
- Create a TOC table pointing to each symbol from the header
+ Create a list of symbols to be part of a TOC contents table
"""
text = []
- # Add header
- text.append(".. contents:: Table of Contents")
- text.append(" :depth: 2")
- text.append(" :local:")
- text.append("")
-
# Sort symbol types per description
symbol_descriptions = []
for k, v in self.DEF_SYMBOL_TYPES.items():
@@ -426,8 +456,8 @@ class ParseDataStructs:
text.append("")
# Sort symbols alphabetically
- for symbol, ref in sorted(refs.items()):
- text.append(f"* :{ref}:")
+ for symbol, (ref, ln) in sorted(refs.items()):
+ text.append(f"- LINENO_{ln}: {ref}")
text.append("") # Add empty line between categories
diff --git a/tools/lib/python/kdoc/python_version.py b/tools/lib/python/kdoc/python_version.py
new file mode 100644
index 000000000000..e83088013db2
--- /dev/null
+++ b/tools/lib/python/kdoc/python_version.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2017-2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+
+"""
+Handle Python version check logic.
+
+Not all Python versions are supported by scripts. Yet, on some cases,
+like during documentation build, a newer version of python could be
+available.
+
+This class allows checking if the minimal requirements are followed.
+
+Better than that, PythonVersion.check_python() not only checks the minimal
+requirements, but it automatically switches to a the newest available
+Python version if present.
+
+"""
+
+import os
+import re
+import subprocess
+import shlex
+import sys
+
+from glob import glob
+from textwrap import indent
+
+class PythonVersion:
+ """
+ Ancillary methods that checks for missing dependencies for different
+ types of types, like binaries, python modules, rpm deps, etc.
+ """
+
+ def __init__(self, version):
+ """Ïnitialize self.version tuple from a version string"""
+ self.version = self.parse_version(version)
+
+ @staticmethod
+ def parse_version(version):
+ """Convert a major.minor.patch version into a tuple"""
+ return tuple(int(x) for x in version.split("."))
+
+ @staticmethod
+ def ver_str(version):
+ """Returns a version tuple as major.minor.patch"""
+ return ".".join([str(x) for x in version])
+
+ @staticmethod
+ def cmd_print(cmd, max_len=80):
+ cmd_line = []
+
+ for w in cmd:
+ w = shlex.quote(w)
+
+ if cmd_line:
+ if not max_len or len(cmd_line[-1]) + len(w) < max_len:
+ cmd_line[-1] += " " + w
+ continue
+ else:
+ cmd_line[-1] += " \\"
+ cmd_line.append(w)
+ else:
+ cmd_line.append(w)
+
+ return "\n ".join(cmd_line)
+
+ def __str__(self):
+ """Returns a version tuple as major.minor.patch from self.version"""
+ return self.ver_str(self.version)
+
+ @staticmethod
+ def get_python_version(cmd):
+ """
+ Get python version from a Python binary. As we need to detect if
+ are out there newer python binaries, we can't rely on sys.release here.
+ """
+
+ kwargs = {}
+ if sys.version_info < (3, 7):
+ kwargs['universal_newlines'] = True
+ else:
+ kwargs['text'] = True
+
+ result = subprocess.run([cmd, "--version"],
+ stdout = subprocess.PIPE,
+ stderr = subprocess.PIPE,
+ **kwargs, check=False)
+
+ version = result.stdout.strip()
+
+ match = re.search(r"(\d+\.\d+\.\d+)", version)
+ if match:
+ return PythonVersion.parse_version(match.group(1))
+
+ print(f"Can't parse version {version}")
+ return (0, 0, 0)
+
+ @staticmethod
+ def find_python(min_version):
+ """
+ Detect if are out there any python 3.xy version newer than the
+ current one.
+
+ Note: this routine is limited to up to 2 digits for python3. We
+ may need to update it one day, hopefully on a distant future.
+ """
+ patterns = [
+ "python3.[0-9][0-9]",
+ "python3.[0-9]",
+ ]
+
+ python_cmd = []
+
+ # Seek for a python binary newer than min_version
+ for path in os.getenv("PATH", "").split(":"):
+ for pattern in patterns:
+ for cmd in glob(os.path.join(path, pattern)):
+ if os.path.isfile(cmd) and os.access(cmd, os.X_OK):
+ version = PythonVersion.get_python_version(cmd)
+ if version >= min_version:
+ python_cmd.append((version, cmd))
+
+ return sorted(python_cmd, reverse=True)
+
+ @staticmethod
+ def check_python(min_version, show_alternatives=False, bail_out=False,
+ success_on_error=False):
+ """
+ Check if the current python binary satisfies our minimal requirement
+ for Sphinx build. If not, re-run with a newer version if found.
+ """
+ cur_ver = sys.version_info[:3]
+ if cur_ver >= min_version:
+ ver = PythonVersion.ver_str(cur_ver)
+ return
+
+ python_ver = PythonVersion.ver_str(cur_ver)
+
+ available_versions = PythonVersion.find_python(min_version)
+ if not available_versions:
+ print(f"ERROR: Python version {python_ver} is not supported anymore\n")
+ print(" Can't find a new version. This script may fail")
+ return
+
+ script_path = os.path.abspath(sys.argv[0])
+
+ # Check possible alternatives
+ if available_versions:
+ new_python_cmd = available_versions[0][1]
+ else:
+ new_python_cmd = None
+
+ if show_alternatives and available_versions:
+ print("You could run, instead:")
+ for _, cmd in available_versions:
+ args = [cmd, script_path] + sys.argv[1:]
+
+ cmd_str = indent(PythonVersion.cmd_print(args), " ")
+ print(f"{cmd_str}\n")
+
+ if bail_out:
+ msg = f"Python {python_ver} not supported. Bailing out"
+ if success_on_error:
+ print(msg, file=sys.stderr)
+ sys.exit(0)
+ else:
+ sys.exit(msg)
+
+ print(f"Python {python_ver} not supported. Changing to {new_python_cmd}")
+
+ # Restart script using the newer version
+ args = [new_python_cmd, script_path] + sys.argv[1:]
+
+ try:
+ os.execv(new_python_cmd, args)
+ except OSError as e:
+ sys.exit(f"Failed to restart with {new_python_cmd}: {e}")
diff --git a/tools/net/ynl/lib/ynl-priv.h b/tools/net/ynl/lib/ynl-priv.h
index 29481989ea76..ced7dce44efb 100644
--- a/tools/net/ynl/lib/ynl-priv.h
+++ b/tools/net/ynl/lib/ynl-priv.h
@@ -313,7 +313,7 @@ ynl_attr_put_str(struct nlmsghdr *nlh, unsigned int attr_type, const char *str)
struct nlattr *attr;
size_t len;
- len = strlen(str);
+ len = strlen(str) + 1;
if (__ynl_attr_put_overflow(nlh, len))
return;
@@ -321,7 +321,7 @@ ynl_attr_put_str(struct nlmsghdr *nlh, unsigned int attr_type, const char *str)
attr->nla_type = attr_type;
strcpy((char *)ynl_attr_data(attr), str);
- attr->nla_len = NLA_HDRLEN + NLA_ALIGN(len);
+ attr->nla_len = NLA_HDRLEN + len;
nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
}
diff --git a/tools/net/ynl/pyynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py
index 9b523cbb3568..fd0f6b8d54d1 100755
--- a/tools/net/ynl/pyynl/ethtool.py
+++ b/tools/net/ynl/pyynl/ethtool.py
@@ -44,6 +44,9 @@ def print_field(reply, *desc):
Pretty-print a set of fields from the reply. desc specifies the
fields and the optional type (bool/yn).
"""
+ if not reply:
+ return
+
if len(desc) == 0:
return print_field(reply, *zip(reply.keys(), reply.keys()))
diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 58086b101057..aadeb3abcad8 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -861,6 +861,18 @@ class TypeIndexedArray(Type):
return [f"{member} = {self.c_name};",
f"{presence} = n_{self.c_name};"]
+ def free_needs_iter(self):
+ return self.sub_type == 'nest'
+
+ def _free_lines(self, ri, var, ref):
+ lines = []
+ if self.sub_type == 'nest':
+ lines += [
+ f"for (i = 0; i < {var}->{ref}_count.{self.c_name}; i++)",
+ f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);',
+ ]
+ lines += f"free({var}->{ref}{self.c_name});",
+ return lines
class TypeNestTypeValue(Type):
def _complex_member_type(self, ri):
diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore
index 4faa4dd72f35..73d883128511 100644
--- a/tools/objtool/.gitignore
+++ b/tools/objtool/.gitignore
@@ -1,5 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
+arch/x86/lib/cpu-feature-names.c
arch/x86/lib/inat-tables.c
/objtool
+feature
+FEATURE-DUMP.objtool
fixdep
libsubcmd/
diff --git a/tools/objtool/Build b/tools/objtool/Build
index a3cdf8af6635..9982e665d58d 100644
--- a/tools/objtool/Build
+++ b/tools/objtool/Build
@@ -8,8 +8,11 @@ objtool-y += builtin-check.o
objtool-y += elf.o
objtool-y += objtool.o
-objtool-$(BUILD_ORC) += orc_gen.o
-objtool-$(BUILD_ORC) += orc_dump.o
+objtool-$(BUILD_DISAS) += disas.o
+objtool-$(BUILD_DISAS) += trace.o
+
+objtool-$(BUILD_ORC) += orc_gen.o orc_dump.o
+objtool-$(BUILD_KLP) += builtin-klp.o klp-diff.o klp-post-link.o
objtool-y += libstring.o
objtool-y += libctype.o
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 8c20361dd100..ad6e1ec706ce 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -2,6 +2,28 @@
include ../scripts/Makefile.include
include ../scripts/Makefile.arch
+ifeq ($(SRCARCH),x86)
+ BUILD_ORC := y
+ ARCH_HAS_KLP := y
+endif
+
+ifeq ($(SRCARCH),loongarch)
+ BUILD_ORC := y
+endif
+
+ifeq ($(ARCH_HAS_KLP),y)
+ HAVE_XXHASH = $(shell printf "$(pound)include <xxhash.h>\nXXH3_state_t *state;int main() {}" | \
+ $(HOSTCC) -xc - -o /dev/null -lxxhash 2> /dev/null && echo y || echo n)
+ ifeq ($(HAVE_XXHASH),y)
+ BUILD_KLP := y
+ LIBXXHASH_CFLAGS := $(shell $(HOSTPKG_CONFIG) libxxhash --cflags 2>/dev/null) \
+ -DBUILD_KLP
+ LIBXXHASH_LIBS := $(shell $(HOSTPKG_CONFIG) libxxhash --libs 2>/dev/null || echo -lxxhash)
+ endif
+endif
+
+export BUILD_ORC BUILD_KLP
+
ifeq ($(srctree),)
srctree := $(patsubst %/,%,$(dir $(CURDIR)))
srctree := $(patsubst %/,%,$(dir $(srctree)))
@@ -23,6 +45,11 @@ LIBELF_LIBS := $(shell $(HOSTPKG_CONFIG) libelf --libs 2>/dev/null || echo -lel
all: $(OBJTOOL)
+WARNINGS := -Werror -Wall -Wextra -Wmissing-prototypes \
+ -Wmissing-declarations -Wwrite-strings \
+ -Wno-implicit-fallthrough -Wno-sign-compare \
+ -Wno-unused-parameter
+
INCLUDES := -I$(srctree)/tools/include \
-I$(srctree)/tools/include/uapi \
-I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \
@@ -30,11 +57,11 @@ INCLUDES := -I$(srctree)/tools/include \
-I$(srctree)/tools/objtool/include \
-I$(srctree)/tools/objtool/arch/$(SRCARCH)/include \
-I$(LIBSUBCMD_OUTPUT)/include
-# Note, EXTRA_WARNINGS here was determined for CC and not HOSTCC, it
-# is passed here to match a legacy behavior.
-WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs
-OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS)
-OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS)
+
+OBJTOOL_CFLAGS := -std=gnu11 -fomit-frame-pointer -O2 -g $(WARNINGS) \
+ $(INCLUDES) $(LIBELF_FLAGS) $(LIBXXHASH_CFLAGS) $(HOSTCFLAGS)
+
+OBJTOOL_LDFLAGS := $(LIBSUBCMD) $(LIBELF_LIBS) $(LIBXXHASH_LIBS) $(HOSTLDFLAGS)
# Allow old libelf to be used:
elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - 2>/dev/null | grep elf_getshdr)
@@ -43,20 +70,32 @@ OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED)
# Always want host compilation.
HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)"
-AWK = awk
-MKDIR = mkdir
+#
+# To support disassembly, objtool needs libopcodes which is provided
+# with libbdf (binutils-dev or binutils-devel package).
+#
+FEATURE_USER = .objtool
+FEATURE_TESTS = libbfd disassembler-init-styled
+FEATURE_DISPLAY =
+include $(srctree)/tools/build/Makefile.feature
+
+ifeq ($(feature-disassembler-init-styled), 1)
+ OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED
+endif
-BUILD_ORC := n
+BUILD_DISAS := n
-ifeq ($(SRCARCH),x86)
- BUILD_ORC := y
+ifeq ($(feature-libbfd),1)
+ BUILD_DISAS := y
+ OBJTOOL_CFLAGS += -DDISAS -DPACKAGE="objtool"
+ OBJTOOL_LDFLAGS += -lopcodes
endif
-ifeq ($(SRCARCH),loongarch)
- BUILD_ORC := y
-endif
+export BUILD_DISAS
+
+AWK = awk
+MKDIR = mkdir
-export BUILD_ORC
export srctree OUTPUT CFLAGS SRCARCH AWK
include $(srctree)/tools/build/Makefile.include
@@ -86,7 +125,10 @@ $(LIBSUBCMD)-clean:
clean: $(LIBSUBCMD)-clean
$(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL)
$(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+ $(Q)$(RM) $(OUTPUT)arch/x86/lib/cpu-feature-names.c $(OUTPUT)fixdep
$(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep
+ $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.objtool
+ $(Q)$(RM) -r -- $(OUTPUT)feature
FORCE:
diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c
index 2e555c4060c5..6cd288150f49 100644
--- a/tools/objtool/arch/loongarch/decode.c
+++ b/tools/objtool/arch/loongarch/decode.c
@@ -1,13 +1,25 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <string.h>
#include <objtool/check.h>
+#include <objtool/disas.h>
#include <objtool/warn.h>
#include <asm/inst.h>
#include <asm/orc_types.h>
#include <linux/objtool_types.h>
#include <arch/elf.h>
-int arch_ftrace_match(char *name)
+const char *arch_reg_name[CFI_NUM_REGS] = {
+ "zero", "ra", "tp", "sp",
+ "a0", "a1", "a2", "a3",
+ "a4", "a5", "a6", "a7",
+ "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6", "t7",
+ "t8", "u0", "fp", "s0",
+ "s1", "s2", "s3", "s4",
+ "s5", "s6", "s7", "s8"
+};
+
+int arch_ftrace_match(const char *name)
{
return !strcmp(name, "_mcount");
}
@@ -17,9 +29,9 @@ unsigned long arch_jump_destination(struct instruction *insn)
return insn->offset + (insn->immediate << 2);
}
-unsigned long arch_dest_reloc_offset(int addend)
+s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc)
{
- return addend;
+ return reloc_addend(reloc);
}
bool arch_pc_relative_reloc(struct reloc *reloc)
@@ -414,3 +426,14 @@ unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *tabl
return reloc->sym->offset + reloc_addend(reloc);
}
}
+
+#ifdef DISAS
+
+int arch_disas_info_init(struct disassemble_info *dinfo)
+{
+ return disas_info_init(dinfo, bfd_arch_loongarch,
+ bfd_mach_loongarch32, bfd_mach_loongarch64,
+ NULL);
+}
+
+#endif /* DISAS */
diff --git a/tools/objtool/arch/loongarch/orc.c b/tools/objtool/arch/loongarch/orc.c
index b58c5ff443c9..ffd3a3c858ae 100644
--- a/tools/objtool/arch/loongarch/orc.c
+++ b/tools/objtool/arch/loongarch/orc.c
@@ -5,7 +5,6 @@
#include <objtool/check.h>
#include <objtool/orc.h>
#include <objtool/warn.h>
-#include <objtool/endianness.h>
int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn)
{
diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c
index a80b75f7b061..aba774109437 100644
--- a/tools/objtool/arch/loongarch/special.c
+++ b/tools/objtool/arch/loongarch/special.c
@@ -194,3 +194,8 @@ struct reloc *arch_find_switch_table(struct objtool_file *file,
return rodata_reloc;
}
+
+const char *arch_cpu_feature_name(int feature_number)
+{
+ return NULL;
+}
diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c
index c851c51d4bd3..e534ac1123b3 100644
--- a/tools/objtool/arch/powerpc/decode.c
+++ b/tools/objtool/arch/powerpc/decode.c
@@ -3,20 +3,32 @@
#include <stdio.h>
#include <stdlib.h>
#include <objtool/check.h>
+#include <objtool/disas.h>
#include <objtool/elf.h>
#include <objtool/arch.h>
#include <objtool/warn.h>
#include <objtool/builtin.h>
-#include <objtool/endianness.h>
-int arch_ftrace_match(char *name)
+const char *arch_reg_name[CFI_NUM_REGS] = {
+ "r0", "sp", "r2", "r3",
+ "r4", "r5", "r6", "r7",
+ "r8", "r9", "r10", "r11",
+ "r12", "r13", "r14", "r15",
+ "r16", "r17", "r18", "r19",
+ "r20", "r21", "r22", "r23",
+ "r24", "r25", "r26", "r27",
+ "r28", "r29", "r30", "r31",
+ "ra"
+};
+
+int arch_ftrace_match(const char *name)
{
return !strcmp(name, "_mcount");
}
-unsigned long arch_dest_reloc_offset(int addend)
+s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc)
{
- return addend;
+ return reloc_addend(reloc);
}
bool arch_callee_saved_reg(unsigned char reg)
@@ -128,3 +140,14 @@ unsigned int arch_reloc_size(struct reloc *reloc)
return 8;
}
}
+
+#ifdef DISAS
+
+int arch_disas_info_init(struct disassemble_info *dinfo)
+{
+ return disas_info_init(dinfo, bfd_arch_powerpc,
+ bfd_mach_ppc, bfd_mach_ppc64,
+ NULL);
+}
+
+#endif /* DISAS */
diff --git a/tools/objtool/arch/powerpc/special.c b/tools/objtool/arch/powerpc/special.c
index 51610689abf7..8f9bf61ca089 100644
--- a/tools/objtool/arch/powerpc/special.c
+++ b/tools/objtool/arch/powerpc/special.c
@@ -18,3 +18,8 @@ struct reloc *arch_find_switch_table(struct objtool_file *file,
{
exit(-1);
}
+
+const char *arch_cpu_feature_name(int feature_number)
+{
+ return NULL;
+}
diff --git a/tools/objtool/arch/x86/Build b/tools/objtool/arch/x86/Build
index 3dedb2fd8f3a..febee0b8ee0b 100644
--- a/tools/objtool/arch/x86/Build
+++ b/tools/objtool/arch/x86/Build
@@ -1,5 +1,5 @@
-objtool-y += special.o
objtool-y += decode.o
+objtool-y += special.o
objtool-y += orc.o
inat_tables_script = ../arch/x86/tools/gen-insn-attr-x86.awk
@@ -12,3 +12,14 @@ $(OUTPUT)arch/x86/lib/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
$(OUTPUT)arch/x86/decode.o: $(OUTPUT)arch/x86/lib/inat-tables.c
CFLAGS_decode.o += -I$(OUTPUT)arch/x86/lib
+
+cpu_features = ../arch/x86/include/asm/cpufeatures.h
+cpu_features_script = ../arch/x86/tools/gen-cpu-feature-names-x86.awk
+
+$(OUTPUT)arch/x86/lib/cpu-feature-names.c: $(cpu_features_script) $(cpu_features)
+ $(call rule_mkdir)
+ $(Q)$(call echo-cmd,gen)$(AWK) -f $(cpu_features_script) $(cpu_features) > $@
+
+$(OUTPUT)arch/x86/special.o: $(OUTPUT)arch/x86/lib/cpu-feature-names.c
+
+CFLAGS_special.o += -I$(OUTPUT)arch/x86/lib
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 0ad5cc70ecbe..f4af82508228 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -16,14 +16,22 @@
#include <asm/orc_types.h>
#include <objtool/check.h>
+#include <objtool/disas.h>
#include <objtool/elf.h>
#include <objtool/arch.h>
#include <objtool/warn.h>
-#include <objtool/endianness.h>
#include <objtool/builtin.h>
#include <arch/elf.h>
-int arch_ftrace_match(char *name)
+const char *arch_reg_name[CFI_NUM_REGS] = {
+ "rax", "rcx", "rdx", "rbx",
+ "rsp", "rbp", "rsi", "rdi",
+ "r8", "r9", "r10", "r11",
+ "r12", "r13", "r14", "r15",
+ "ra"
+};
+
+int arch_ftrace_match(const char *name)
{
return !strcmp(name, "__fentry__");
}
@@ -68,9 +76,65 @@ bool arch_callee_saved_reg(unsigned char reg)
}
}
-unsigned long arch_dest_reloc_offset(int addend)
+/* Undo the effects of __pa_symbol() if necessary */
+static unsigned long phys_to_virt(unsigned long pa)
+{
+ s64 va = pa;
+
+ if (va > 0)
+ va &= ~(0x80000000);
+
+ return va;
+}
+
+s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc)
+{
+ s64 addend = reloc_addend(reloc);
+
+ if (arch_pc_relative_reloc(reloc))
+ addend += insn->offset + insn->len - reloc_offset(reloc);
+
+ return phys_to_virt(addend);
+}
+
+static void scan_for_insn(struct section *sec, unsigned long offset,
+ unsigned long *insn_off, unsigned int *insn_len)
{
- return addend + 4;
+ unsigned long o = 0;
+ struct insn insn;
+
+ while (1) {
+
+ insn_decode(&insn, sec->data->d_buf + o, sec_size(sec) - o,
+ INSN_MODE_64);
+
+ if (o + insn.length > offset) {
+ *insn_off = o;
+ *insn_len = insn.length;
+ return;
+ }
+
+ o += insn.length;
+ }
+}
+
+u64 arch_adjusted_addend(struct reloc *reloc)
+{
+ unsigned int type = reloc_type(reloc);
+ s64 addend = reloc_addend(reloc);
+ unsigned long insn_off;
+ unsigned int insn_len;
+
+ if (type == R_X86_64_PLT32)
+ return addend + 4;
+
+ if (type != R_X86_64_PC32 || !is_text_sec(reloc->sec->base))
+ return addend;
+
+ scan_for_insn(reloc->sec->base, reloc_offset(reloc),
+ &insn_off, &insn_len);
+
+ return addend + insn_off + insn_len - reloc_offset(reloc);
}
unsigned long arch_jump_destination(struct instruction *insn)
@@ -189,15 +253,6 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
op2 = ins.opcode.bytes[1];
op3 = ins.opcode.bytes[2];
- /*
- * XXX hack, decoder is buggered and thinks 0xea is 7 bytes long.
- */
- if (op1 == 0xea) {
- insn->len = 1;
- insn->type = INSN_BUG;
- return 0;
- }
-
if (ins.rex_prefix.nbytes) {
rex = ins.rex_prefix.bytes[0];
rex_w = X86_REX_W(rex) >> 3;
@@ -503,6 +558,12 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
break;
case 0x90:
+ if (rex_b) /* XCHG %r8, %rax */
+ break;
+
+ if (prefix == 0xf3) /* REP NOP := PAUSE */
+ break;
+
insn->type = INSN_NOP;
break;
@@ -556,13 +617,14 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
} else if (op2 == 0x0b || op2 == 0xb9) {
- /* ud2 */
+ /* ud2, ud1 */
insn->type = INSN_BUG;
- } else if (op2 == 0x0d || op2 == 0x1f) {
+ } else if (op2 == 0x1f) {
- /* nopl/nopw */
- insn->type = INSN_NOP;
+ /* 0f 1f /0 := NOPL */
+ if (modrm_reg == 0)
+ insn->type = INSN_NOP;
} else if (op2 == 0x1e) {
@@ -692,6 +754,10 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
insn->type = INSN_SYSRET;
break;
+ case 0xd6: /* udb */
+ insn->type = INSN_BUG;
+ break;
+
case 0xe0: /* loopne */
case 0xe1: /* loope */
case 0xe2: /* loop */
@@ -892,3 +958,14 @@ bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc)
return false;
}
}
+
+#ifdef DISAS
+
+int arch_disas_info_init(struct disassemble_info *dinfo)
+{
+ return disas_info_init(dinfo, bfd_arch_i386,
+ bfd_mach_i386_i386, bfd_mach_x86_64,
+ "att");
+}
+
+#endif /* DISAS */
diff --git a/tools/objtool/arch/x86/orc.c b/tools/objtool/arch/x86/orc.c
index 7176b9ec5b05..735e150ca6b7 100644
--- a/tools/objtool/arch/x86/orc.c
+++ b/tools/objtool/arch/x86/orc.c
@@ -5,7 +5,6 @@
#include <objtool/check.h>
#include <objtool/orc.h>
#include <objtool/warn.h>
-#include <objtool/endianness.h>
int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn)
{
diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c
index 06ca4a2659a4..e817a3fff449 100644
--- a/tools/objtool/arch/x86/special.c
+++ b/tools/objtool/arch/x86/special.c
@@ -4,6 +4,10 @@
#include <objtool/special.h>
#include <objtool/builtin.h>
#include <objtool/warn.h>
+#include <asm/cpufeatures.h>
+
+/* cpu feature name array generated from cpufeatures.h */
+#include "cpu-feature-names.c"
void arch_handle_alternative(struct special_alt *alt)
{
@@ -89,7 +93,7 @@ struct reloc *arch_find_switch_table(struct objtool_file *file,
/* look for a relocation which references .rodata */
text_reloc = find_reloc_by_dest_range(file->elf, insn->sec,
insn->offset, insn->len);
- if (!text_reloc || text_reloc->sym->type != STT_SECTION ||
+ if (!text_reloc || !is_sec_sym(text_reloc->sym) ||
!text_reloc->sym->sec->rodata)
return NULL;
@@ -134,3 +138,9 @@ struct reloc *arch_find_switch_table(struct objtool_file *file,
*table_size = 0;
return rodata_reloc;
}
+
+const char *arch_cpu_feature_name(int feature_number)
+{
+ return (feature_number < ARRAY_SIZE(cpu_feature_names)) ?
+ cpu_feature_names[feature_number] : NULL;
+}
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index 0f6b197cfcb0..b780df513715 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -73,35 +73,41 @@ static int parse_hacks(const struct option *opt, const char *str, int unset)
static const struct option check_options[] = {
OPT_GROUP("Actions:"),
+ OPT_BOOLEAN(0, "checksum", &opts.checksum, "generate per-function checksums"),
+ OPT_BOOLEAN(0, "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"),
+ OPT_STRING_OPTARG('d', "disas", &opts.disas, "function-pattern", "disassemble functions", "*"),
OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks),
- OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"),
- OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"),
- OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"),
- OPT_BOOLEAN(0, "orc", &opts.orc, "generate ORC metadata"),
- OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"),
- OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"),
- OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"),
- OPT_INTEGER(0, "prefix", &opts.prefix, "generate prefix symbols"),
- OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"),
- OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"),
- OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"),
- OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"),
- OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"),
- OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"),
- OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump),
+ OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"),
+ OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"),
+ OPT_BOOLEAN(0, "noabs", &opts.noabs, "reject absolute references in allocatable sections"),
+ OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"),
+ OPT_BOOLEAN(0, "orc", &opts.orc, "generate ORC metadata"),
+ OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"),
+ OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"),
+ OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"),
+ OPT_INTEGER(0, "prefix", &opts.prefix, "generate prefix symbols"),
+ OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"),
+ OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"),
+ OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"),
+ OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"),
+ OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump),
OPT_GROUP("Options:"),
- OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on error"),
- OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"),
- OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"),
- OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"),
- OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"),
- OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"),
- OPT_STRING('o', "output", &opts.output, "file", "output file name"),
- OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"),
- OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"),
- OPT_BOOLEAN('v', "verbose", &opts.verbose, "verbose warnings"),
- OPT_BOOLEAN(0, "Werror", &opts.werror, "return error on warnings"),
+ OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on error"),
+ OPT_BOOLEAN(0, "backup", &opts.backup, "create backup (.orig) file on warning/error"),
+ OPT_STRING(0, "debug-checksum", &opts.debug_checksum, "funcs", "enable checksum debug output"),
+ OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"),
+ OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"),
+ OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"),
+ OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"),
+ OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"),
+ OPT_STRING('o', "output", &opts.output, "file", "output file name"),
+ OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"),
+ OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"),
+ OPT_STRING(0, "trace", &opts.trace, "func", "trace function validation"),
+ OPT_BOOLEAN('v', "verbose", &opts.verbose, "verbose warnings"),
+ OPT_BOOLEAN(0, "werror", &opts.werror, "return error on warnings"),
+ OPT_BOOLEAN(0, "wide", &opts.wide, "wide output"),
OPT_END(),
};
@@ -159,7 +165,21 @@ static bool opts_valid(void)
return false;
}
- if (opts.hack_jump_label ||
+#ifndef BUILD_KLP
+ if (opts.checksum) {
+ ERROR("--checksum not supported; install xxhash-devel/libxxhash-dev (version >= 0.8) and recompile");
+ return false;
+ }
+#endif
+
+ if (opts.debug_checksum && !opts.checksum) {
+ ERROR("--debug-checksum requires --checksum");
+ return false;
+ }
+
+ if (opts.checksum ||
+ opts.disas ||
+ opts.hack_jump_label ||
opts.hack_noinstr ||
opts.ibt ||
opts.mcount ||
@@ -243,15 +263,12 @@ static void save_argv(int argc, const char **argv)
ERROR_GLIBC("strdup(%s)", argv[i]);
exit(1);
}
- };
+ }
}
-void print_args(void)
+int make_backup(void)
{
- char *backup = NULL;
-
- if (opts.output || opts.dryrun)
- goto print;
+ char *backup;
/*
* Make a backup before kbuild deletes the file so the error
@@ -260,33 +277,32 @@ void print_args(void)
backup = malloc(strlen(objname) + strlen(ORIG_SUFFIX) + 1);
if (!backup) {
ERROR_GLIBC("malloc");
- goto print;
+ return 1;
}
strcpy(backup, objname);
strcat(backup, ORIG_SUFFIX);
- if (copy_file(objname, backup)) {
- backup = NULL;
- goto print;
- }
+ if (copy_file(objname, backup))
+ return 1;
-print:
/*
- * Print the cmdline args to make it easier to recreate. If '--output'
- * wasn't used, add it to the printed args with the backup as input.
+ * Print the cmdline args to make it easier to recreate.
*/
+
fprintf(stderr, "%s", orig_argv[0]);
for (int i = 1; i < orig_argc; i++) {
char *arg = orig_argv[i];
- if (backup && !strcmp(arg, objname))
+ /* Modify the printed args to use the backup */
+ if (!opts.output && !strcmp(arg, objname))
fprintf(stderr, " %s -o %s", backup, objname);
else
fprintf(stderr, " %s", arg);
}
fprintf(stderr, "\n");
+ return 0;
}
int objtool_run(int argc, const char **argv)
@@ -332,5 +348,5 @@ int objtool_run(int argc, const char **argv)
if (!opts.dryrun && file->elf->changed && elf_write(file->elf))
return 1;
- return 0;
+ return elf_close(file->elf);
}
diff --git a/tools/objtool/builtin-klp.c b/tools/objtool/builtin-klp.c
new file mode 100644
index 000000000000..56d5a5b92f72
--- /dev/null
+++ b/tools/objtool/builtin-klp.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <subcmd/parse-options.h>
+#include <string.h>
+#include <stdlib.h>
+#include <objtool/builtin.h>
+#include <objtool/objtool.h>
+#include <objtool/klp.h>
+
+struct subcmd {
+ const char *name;
+ const char *description;
+ int (*fn)(int, const char **);
+};
+
+static struct subcmd subcmds[] = {
+ { "diff", "Generate binary diff of two object files", cmd_klp_diff, },
+ { "post-link", "Finalize klp symbols/relocs after module linking", cmd_klp_post_link, },
+};
+
+static void cmd_klp_usage(void)
+{
+ fprintf(stderr, "usage: objtool klp <subcommand> [<options>]\n\n");
+ fprintf(stderr, "Subcommands:\n");
+
+ for (int i = 0; i < ARRAY_SIZE(subcmds); i++) {
+ struct subcmd *cmd = &subcmds[i];
+
+ fprintf(stderr, " %s\t%s\n", cmd->name, cmd->description);
+ }
+
+ exit(1);
+}
+
+int cmd_klp(int argc, const char **argv)
+{
+ argc--;
+ argv++;
+
+ if (!argc)
+ cmd_klp_usage();
+
+ if (argc) {
+ for (int i = 0; i < ARRAY_SIZE(subcmds); i++) {
+ struct subcmd *cmd = &subcmds[i];
+
+ if (!strcmp(cmd->name, argv[0]))
+ return cmd->fn(argc, argv);
+ }
+ }
+
+ cmd_klp_usage();
+ return 0;
+}
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 620854fdaaf6..9ec0e07cce90 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3,6 +3,8 @@
* Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com>
*/
+#define _GNU_SOURCE /* memmem() */
+#include <fnmatch.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
@@ -11,10 +13,13 @@
#include <objtool/builtin.h>
#include <objtool/cfi.h>
#include <objtool/arch.h>
+#include <objtool/disas.h>
#include <objtool/check.h>
#include <objtool/special.h>
+#include <objtool/trace.h>
#include <objtool/warn.h>
-#include <objtool/endianness.h>
+#include <objtool/checksum.h>
+#include <objtool/util.h>
#include <linux/objtool_types.h>
#include <linux/hashtable.h>
@@ -22,11 +27,6 @@
#include <linux/static_call_types.h>
#include <linux/string.h>
-struct alternative {
- struct alternative *next;
- struct instruction *insn;
-};
-
static unsigned long nr_cfi, nr_cfi_reused, nr_cfi_cache;
static struct cfi_init_state initial_func_cfi;
@@ -34,6 +34,10 @@ static struct cfi_state init_cfi;
static struct cfi_state func_cfi;
static struct cfi_state force_undefined_cfi;
+struct disas_context *objtool_disas_ctx;
+
+size_t sym_name_max_len;
+
struct instruction *find_insn(struct objtool_file *file,
struct section *sec, unsigned long offset)
{
@@ -106,7 +110,7 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file,
#define for_each_insn(file, insn) \
for (struct section *__sec, *__fake = (struct section *)1; \
__fake; __fake = NULL) \
- for_each_sec(file, __sec) \
+ for_each_sec(file->elf, __sec) \
sec_for_each_insn(file, __sec, insn)
#define func_for_each_insn(file, func, insn) \
@@ -131,15 +135,6 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file,
for (insn = next_insn_same_sec(file, insn); insn; \
insn = next_insn_same_sec(file, insn))
-static inline struct symbol *insn_call_dest(struct instruction *insn)
-{
- if (insn->type == INSN_JUMP_DYNAMIC ||
- insn->type == INSN_CALL_DYNAMIC)
- return NULL;
-
- return insn->_call_dest;
-}
-
static inline struct reloc *insn_jump_table(struct instruction *insn)
{
if (insn->type == INSN_JUMP_DYNAMIC ||
@@ -186,20 +181,6 @@ static bool is_sibling_call(struct instruction *insn)
}
/*
- * Checks if a string ends with another.
- */
-static bool str_ends_with(const char *s, const char *sub)
-{
- const int slen = strlen(s);
- const int sublen = strlen(sub);
-
- if (sublen > slen)
- return 0;
-
- return !memcmp(s + slen - sublen, sub, sublen);
-}
-
-/*
* Checks if a function is a Rust "noreturn" one.
*/
static bool is_rust_noreturn(const struct symbol *func)
@@ -262,7 +243,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
if (!func)
return false;
- if (func->bind == STB_GLOBAL || func->bind == STB_WEAK) {
+ if (!is_local_sym(func)) {
if (is_rust_noreturn(func))
return true;
@@ -271,7 +252,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
return true;
}
- if (func->bind == STB_WEAK)
+ if (is_weak_sym(func))
return false;
if (!func->len)
@@ -431,14 +412,13 @@ static int decode_instructions(struct objtool_file *file)
struct symbol *func;
unsigned long offset;
struct instruction *insn;
- int ret;
- for_each_sec(file, sec) {
+ for_each_sec(file->elf, sec) {
struct instruction *insns = NULL;
u8 prev_len = 0;
u8 idx = 0;
- if (!(sec->sh.sh_flags & SHF_EXECINSTR))
+ if (!is_text_sec(sec))
continue;
if (strcmp(sec->name, ".altinstr_replacement") &&
@@ -461,9 +441,9 @@ static int decode_instructions(struct objtool_file *file)
if (!strcmp(sec->name, ".init.text") && !opts.module)
sec->init = true;
- for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) {
+ for (offset = 0; offset < sec_size(sec); offset += insn->len) {
if (!insns || idx == INSN_CHUNK_MAX) {
- insns = calloc(sizeof(*insn), INSN_CHUNK_SIZE);
+ insns = calloc(INSN_CHUNK_SIZE, sizeof(*insn));
if (!insns) {
ERROR_GLIBC("calloc");
return -1;
@@ -480,11 +460,8 @@ static int decode_instructions(struct objtool_file *file)
insn->offset = offset;
insn->prev_len = prev_len;
- ret = arch_decode_instruction(file, sec, offset,
- sec->sh.sh_size - offset,
- insn);
- if (ret)
- return ret;
+ if (arch_decode_instruction(file, sec, offset, sec_size(sec) - offset, insn))
+ return -1;
prev_len = insn->len;
@@ -501,12 +478,12 @@ static int decode_instructions(struct objtool_file *file)
}
sec_for_each_sym(sec, func) {
- if (func->type != STT_NOTYPE && func->type != STT_FUNC)
+ if (!is_notype_sym(func) && !is_func_sym(func))
continue;
- if (func->offset == sec->sh.sh_size) {
+ if (func->offset == sec_size(sec)) {
/* Heuristic: likely an "end" symbol */
- if (func->type == STT_NOTYPE)
+ if (is_notype_sym(func))
continue;
ERROR("%s(): STT_FUNC at end of section", func->name);
return -1;
@@ -522,7 +499,7 @@ static int decode_instructions(struct objtool_file *file)
sym_for_each_insn(file, func, insn) {
insn->sym = func;
- if (func->type == STT_FUNC &&
+ if (is_func_sym(func) &&
insn->type == INSN_ENDBR &&
list_empty(&insn->call_node)) {
if (insn->offset == func->offset) {
@@ -566,7 +543,7 @@ static int add_pv_ops(struct objtool_file *file, const char *symname)
idx = (reloc_offset(reloc) - sym->offset) / sizeof(unsigned long);
func = reloc->sym;
- if (func->type == STT_SECTION)
+ if (is_sec_sym(func))
func = find_symbol_by_offset(reloc->sym->sec,
reloc_addend(reloc));
if (!func) {
@@ -600,7 +577,7 @@ static int init_pv_ops(struct objtool_file *file)
};
const char *pv_ops;
struct symbol *sym;
- int idx, nr, ret;
+ int idx, nr;
if (!opts.noinstr)
return 0;
@@ -612,7 +589,7 @@ static int init_pv_ops(struct objtool_file *file)
return 0;
nr = sym->len / sizeof(unsigned long);
- file->pv_ops = calloc(sizeof(struct pv_state), nr);
+ file->pv_ops = calloc(nr, sizeof(struct pv_state));
if (!file->pv_ops) {
ERROR_GLIBC("calloc");
return -1;
@@ -622,14 +599,27 @@ static int init_pv_ops(struct objtool_file *file)
INIT_LIST_HEAD(&file->pv_ops[idx].targets);
for (idx = 0; (pv_ops = pv_ops_tables[idx]); idx++) {
- ret = add_pv_ops(file, pv_ops);
- if (ret)
- return ret;
+ if (add_pv_ops(file, pv_ops))
+ return -1;
}
return 0;
}
+static bool is_livepatch_module(struct objtool_file *file)
+{
+ struct section *sec;
+
+ if (!opts.module)
+ return false;
+
+ sec = find_section_by_name(file->elf, ".modinfo");
+ if (!sec)
+ return false;
+
+ return memmem(sec->data->d_buf, sec_size(sec), "\0livepatch=Y", 12);
+}
+
static int create_static_call_sections(struct objtool_file *file)
{
struct static_call_site *site;
@@ -641,8 +631,14 @@ static int create_static_call_sections(struct objtool_file *file)
sec = find_section_by_name(file->elf, ".static_call_sites");
if (sec) {
- INIT_LIST_HEAD(&file->static_call_list);
- WARN("file already has .static_call_sites section, skipping");
+ /*
+ * Livepatch modules may have already extracted the static call
+ * site entries to take advantage of vmlinux static call
+ * privileges.
+ */
+ if (!file->klp)
+ WARN("file already has .static_call_sites section, skipping");
+
return 0;
}
@@ -686,7 +682,7 @@ static int create_static_call_sections(struct objtool_file *file)
key_sym = find_symbol_by_name(file->elf, tmp);
if (!key_sym) {
- if (!opts.module) {
+ if (!opts.module || file->klp) {
ERROR("static_call: can't find static_call_key symbol: %s", tmp);
return -1;
}
@@ -829,7 +825,7 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file)
struct symbol *sym = insn->sym;
*site = 0;
- if (opts.module && sym && sym->type == STT_FUNC &&
+ if (opts.module && sym && is_func_sym(sym) &&
insn->offset == sym->offset &&
(!strcmp(sym->name, "init_module") ||
!strcmp(sym->name, "cleanup_module"))) {
@@ -857,14 +853,13 @@ static int create_cfi_sections(struct objtool_file *file)
sec = find_section_by_name(file->elf, ".cfi_sites");
if (sec) {
- INIT_LIST_HEAD(&file->call_list);
WARN("file already has .cfi_sites section, skipping");
return 0;
}
idx = 0;
- for_each_sym(file, sym) {
- if (sym->type != STT_FUNC)
+ for_each_sym(file->elf, sym) {
+ if (!is_func_sym(sym))
continue;
if (strncmp(sym->name, "__cfi_", 6))
@@ -879,8 +874,8 @@ static int create_cfi_sections(struct objtool_file *file)
return -1;
idx = 0;
- for_each_sym(file, sym) {
- if (sym->type != STT_FUNC)
+ for_each_sym(file->elf, sym) {
+ if (!is_func_sym(sym))
continue;
if (strncmp(sym->name, "__cfi_", 6))
@@ -906,8 +901,13 @@ static int create_mcount_loc_sections(struct objtool_file *file)
sec = find_section_by_name(file->elf, "__mcount_loc");
if (sec) {
- INIT_LIST_HEAD(&file->mcount_loc_list);
- WARN("file already has __mcount_loc section, skipping");
+ /*
+ * Livepatch modules have already extracted their __mcount_loc
+ * entries to cover the !CONFIG_FTRACE_MCOUNT_USE_OBJTOOL case.
+ */
+ if (!file->klp)
+ WARN("file already has __mcount_loc section, skipping");
+
return 0;
}
@@ -951,7 +951,6 @@ static int create_direct_call_sections(struct objtool_file *file)
sec = find_section_by_name(file->elf, ".call_sites");
if (sec) {
- INIT_LIST_HEAD(&file->call_list);
WARN("file already has .call_sites section, skipping");
return 0;
}
@@ -982,6 +981,59 @@ static int create_direct_call_sections(struct objtool_file *file)
return 0;
}
+#ifdef BUILD_KLP
+static int create_sym_checksum_section(struct objtool_file *file)
+{
+ struct section *sec;
+ struct symbol *sym;
+ unsigned int idx = 0;
+ struct sym_checksum *checksum;
+ size_t entsize = sizeof(struct sym_checksum);
+
+ sec = find_section_by_name(file->elf, ".discard.sym_checksum");
+ if (sec) {
+ if (!opts.dryrun)
+ WARN("file already has .discard.sym_checksum section, skipping");
+
+ return 0;
+ }
+
+ for_each_sym(file->elf, sym)
+ if (sym->csum.checksum)
+ idx++;
+
+ if (!idx)
+ return 0;
+
+ sec = elf_create_section_pair(file->elf, ".discard.sym_checksum", entsize,
+ idx, idx);
+ if (!sec)
+ return -1;
+
+ idx = 0;
+ for_each_sym(file->elf, sym) {
+ if (!sym->csum.checksum)
+ continue;
+
+ if (!elf_init_reloc(file->elf, sec->rsec, idx, idx * entsize,
+ sym, 0, R_TEXT64))
+ return -1;
+
+ checksum = (struct sym_checksum *)sec->data->d_buf + idx;
+ checksum->addr = 0; /* reloc */
+ checksum->checksum = sym->csum.checksum;
+
+ mark_sec_changed(file->elf, sec, true);
+
+ idx++;
+ }
+
+ return 0;
+}
+#else
+static int create_sym_checksum_section(struct objtool_file *file) { return -EINVAL; }
+#endif
+
/*
* Warnings shouldn't be reported for ignored functions.
*/
@@ -1433,9 +1485,14 @@ static void add_return_call(struct objtool_file *file, struct instruction *insn,
}
static bool is_first_func_insn(struct objtool_file *file,
- struct instruction *insn, struct symbol *sym)
+ struct instruction *insn)
{
- if (insn->offset == sym->offset)
+ struct symbol *func = insn_func(insn);
+
+ if (!func)
+ return false;
+
+ if (insn->offset == func->offset)
return true;
/* Allow direct CALL/JMP past ENDBR */
@@ -1443,7 +1500,7 @@ static bool is_first_func_insn(struct objtool_file *file,
struct instruction *prev = prev_insn_same_sym(file, insn);
if (prev && prev->type == INSN_ENDBR &&
- insn->offset == sym->offset + prev->len)
+ insn->offset == func->offset + prev->len)
return true;
}
@@ -1451,44 +1508,22 @@ static bool is_first_func_insn(struct objtool_file *file,
}
/*
- * A sibling call is a tail-call to another symbol -- to differentiate from a
- * recursive tail-call which is to the same symbol.
- */
-static bool jump_is_sibling_call(struct objtool_file *file,
- struct instruction *from, struct instruction *to)
-{
- struct symbol *fs = from->sym;
- struct symbol *ts = to->sym;
-
- /* Not a sibling call if from/to a symbol hole */
- if (!fs || !ts)
- return false;
-
- /* Not a sibling call if not targeting the start of a symbol. */
- if (!is_first_func_insn(file, to, ts))
- return false;
-
- /* Disallow sibling calls into STT_NOTYPE */
- if (ts->type == STT_NOTYPE)
- return false;
-
- /* Must not be self to be a sibling */
- return fs->pfunc != ts->pfunc;
-}
-
-/*
* Find the destination instructions for all jumps.
*/
static int add_jump_destinations(struct objtool_file *file)
{
- struct instruction *insn, *jump_dest;
+ struct instruction *insn;
struct reloc *reloc;
- struct section *dest_sec;
- unsigned long dest_off;
- int ret;
for_each_insn(file, insn) {
struct symbol *func = insn_func(insn);
+ struct instruction *dest_insn;
+ struct section *dest_sec;
+ struct symbol *dest_sym;
+ unsigned long dest_off;
+
+ if (!is_static_jump(insn))
+ continue;
if (insn->jump_dest) {
/*
@@ -1497,53 +1532,53 @@ static int add_jump_destinations(struct objtool_file *file)
*/
continue;
}
- if (!is_static_jump(insn))
- continue;
reloc = insn_reloc(file, insn);
if (!reloc) {
dest_sec = insn->sec;
dest_off = arch_jump_destination(insn);
- } else if (reloc->sym->type == STT_SECTION) {
- dest_sec = reloc->sym->sec;
- dest_off = arch_dest_reloc_offset(reloc_addend(reloc));
- } else if (reloc->sym->retpoline_thunk) {
- ret = add_retpoline_call(file, insn);
- if (ret)
- return ret;
- continue;
- } else if (reloc->sym->return_thunk) {
- add_return_call(file, insn, true);
- continue;
- } else if (func) {
- /*
- * External sibling call or internal sibling call with
- * STT_FUNC reloc.
- */
- ret = add_call_dest(file, insn, reloc->sym, true);
- if (ret)
- return ret;
- continue;
- } else if (reloc->sym->sec->idx) {
- dest_sec = reloc->sym->sec;
- dest_off = reloc->sym->sym.st_value +
- arch_dest_reloc_offset(reloc_addend(reloc));
+ dest_sym = dest_sec->sym;
} else {
- /* non-func asm code jumping to another file */
- continue;
+ dest_sym = reloc->sym;
+ if (is_undef_sym(dest_sym)) {
+ if (dest_sym->retpoline_thunk) {
+ if (add_retpoline_call(file, insn))
+ return -1;
+ continue;
+ }
+
+ if (dest_sym->return_thunk) {
+ add_return_call(file, insn, true);
+ continue;
+ }
+
+ /* External symbol */
+ if (func) {
+ /* External sibling call */
+ if (add_call_dest(file, insn, dest_sym, true))
+ return -1;
+ continue;
+ }
+
+ /* Non-func asm code jumping to external symbol */
+ continue;
+ }
+
+ dest_sec = dest_sym->sec;
+ dest_off = dest_sym->offset + arch_insn_adjusted_addend(insn, reloc);
}
- jump_dest = find_insn(file, dest_sec, dest_off);
- if (!jump_dest) {
+ dest_insn = find_insn(file, dest_sec, dest_off);
+ if (!dest_insn) {
struct symbol *sym = find_symbol_by_offset(dest_sec, dest_off);
/*
- * This is a special case for retbleed_untrain_ret().
- * It jumps to __x86_return_thunk(), but objtool
- * can't find the thunk's starting RET
- * instruction, because the RET is also in the
- * middle of another instruction. Objtool only
- * knows about the outer instruction.
+ * retbleed_untrain_ret() jumps to
+ * __x86_return_thunk(), but objtool can't find
+ * the thunk's starting RET instruction,
+ * because the RET is also in the middle of
+ * another instruction. Objtool only knows
+ * about the outer instruction.
*/
if (sym && sym->embedded_insn) {
add_return_call(file, insn, false);
@@ -1551,76 +1586,52 @@ static int add_jump_destinations(struct objtool_file *file)
}
/*
- * GCOV/KCOV dead code can jump to the end of the
- * function/section.
+ * GCOV/KCOV dead code can jump to the end of
+ * the function/section.
*/
if (file->ignore_unreachables && func &&
dest_sec == insn->sec &&
dest_off == func->offset + func->len)
continue;
- ERROR_INSN(insn, "can't find jump dest instruction at %s+0x%lx",
- dest_sec->name, dest_off);
+ ERROR_INSN(insn, "can't find jump dest instruction at %s",
+ offstr(dest_sec, dest_off));
return -1;
}
- /*
- * An intra-TU jump in retpoline.o might not have a relocation
- * for its jump dest, in which case the above
- * add_{retpoline,return}_call() didn't happen.
- */
- if (jump_dest->sym && jump_dest->offset == jump_dest->sym->offset) {
- if (jump_dest->sym->retpoline_thunk) {
- ret = add_retpoline_call(file, insn);
- if (ret)
- return ret;
- continue;
- }
- if (jump_dest->sym->return_thunk) {
- add_return_call(file, insn, true);
- continue;
- }
+ if (!dest_sym || is_sec_sym(dest_sym)) {
+ dest_sym = dest_insn->sym;
+ if (!dest_sym)
+ goto set_jump_dest;
}
- /*
- * Cross-function jump.
- */
- if (func && insn_func(jump_dest) && func != insn_func(jump_dest)) {
+ if (dest_sym->retpoline_thunk && dest_insn->offset == dest_sym->offset) {
+ if (add_retpoline_call(file, insn))
+ return -1;
+ continue;
+ }
- /*
- * For GCC 8+, create parent/child links for any cold
- * subfunctions. This is _mostly_ redundant with a
- * similar initialization in read_symbols().
- *
- * If a function has aliases, we want the *first* such
- * function in the symbol table to be the subfunction's
- * parent. In that case we overwrite the
- * initialization done in read_symbols().
- *
- * However this code can't completely replace the
- * read_symbols() code because this doesn't detect the
- * case where the parent function's only reference to a
- * subfunction is through a jump table.
- */
- if (!strstr(func->name, ".cold") &&
- strstr(insn_func(jump_dest)->name, ".cold")) {
- func->cfunc = insn_func(jump_dest);
- insn_func(jump_dest)->pfunc = func;
- }
+ if (dest_sym->return_thunk && dest_insn->offset == dest_sym->offset) {
+ add_return_call(file, insn, true);
+ continue;
}
- if (jump_is_sibling_call(file, insn, jump_dest)) {
- /*
- * Internal sibling call without reloc or with
- * STT_SECTION reloc.
- */
- ret = add_call_dest(file, insn, insn_func(jump_dest), true);
- if (ret)
- return ret;
+ if (!insn->sym || insn->sym->pfunc == dest_sym->pfunc)
+ goto set_jump_dest;
+
+ /*
+ * Internal cross-function jump.
+ */
+
+ if (is_first_func_insn(file, dest_insn)) {
+ /* Internal sibling call */
+ if (add_call_dest(file, insn, dest_sym, true))
+ return -1;
continue;
}
- insn->jump_dest = jump_dest;
+set_jump_dest:
+ insn->jump_dest = dest_insn;
}
return 0;
@@ -1646,7 +1657,6 @@ static int add_call_destinations(struct objtool_file *file)
unsigned long dest_off;
struct symbol *dest;
struct reloc *reloc;
- int ret;
for_each_insn(file, insn) {
struct symbol *func = insn_func(insn);
@@ -1658,9 +1668,8 @@ static int add_call_destinations(struct objtool_file *file)
dest_off = arch_jump_destination(insn);
dest = find_call_destination(insn->sec, dest_off);
- ret = add_call_dest(file, insn, dest, false);
- if (ret)
- return ret;
+ if (add_call_dest(file, insn, dest, false))
+ return -1;
if (func && func->ignore)
continue;
@@ -1670,13 +1679,13 @@ static int add_call_destinations(struct objtool_file *file)
return -1;
}
- if (func && insn_call_dest(insn)->type != STT_FUNC) {
+ if (func && !is_func_sym(insn_call_dest(insn))) {
ERROR_INSN(insn, "unsupported call to non-function");
return -1;
}
- } else if (reloc->sym->type == STT_SECTION) {
- dest_off = arch_dest_reloc_offset(reloc_addend(reloc));
+ } else if (is_sec_sym(reloc->sym)) {
+ dest_off = arch_insn_adjusted_addend(insn, reloc);
dest = find_call_destination(reloc->sym->sec, dest_off);
if (!dest) {
ERROR_INSN(insn, "can't find call dest symbol at %s+0x%lx",
@@ -1684,19 +1693,16 @@ static int add_call_destinations(struct objtool_file *file)
return -1;
}
- ret = add_call_dest(file, insn, dest, false);
- if (ret)
- return ret;
+ if (add_call_dest(file, insn, dest, false))
+ return -1;
} else if (reloc->sym->retpoline_thunk) {
- ret = add_retpoline_call(file, insn);
- if (ret)
- return ret;
+ if (add_retpoline_call(file, insn))
+ return -1;
} else {
- ret = add_call_dest(file, insn, reloc->sym, false);
- if (ret)
- return ret;
+ if (add_call_dest(file, insn, reloc->sym, false))
+ return -1;
}
}
@@ -1745,6 +1751,7 @@ static int handle_group_alt(struct objtool_file *file,
orig_alt_group->last_insn = last_orig_insn;
orig_alt_group->nop = NULL;
orig_alt_group->ignore = orig_insn->ignore_alts;
+ orig_alt_group->feature = 0;
} else {
if (orig_alt_group->last_insn->offset + orig_alt_group->last_insn->len -
orig_alt_group->first_insn->offset != special_alt->orig_len) {
@@ -1784,6 +1791,7 @@ static int handle_group_alt(struct objtool_file *file,
nop->type = INSN_NOP;
nop->sym = orig_insn->sym;
nop->alt_group = new_alt_group;
+ nop->fake = 1;
}
if (!special_alt->new_len) {
@@ -1848,6 +1856,7 @@ end:
new_alt_group->nop = nop;
new_alt_group->ignore = (*new_insn)->ignore_alts;
new_alt_group->cfi = orig_alt_group->cfi;
+ new_alt_group->feature = special_alt->feature;
return 0;
}
@@ -1912,8 +1921,9 @@ static int add_special_section_alts(struct objtool_file *file)
struct list_head special_alts;
struct instruction *orig_insn, *new_insn;
struct special_alt *special_alt, *tmp;
+ enum alternative_type alt_type;
struct alternative *alt;
- int ret;
+ struct alternative *a;
if (special_get_alts(file->elf, &special_alts))
return -1;
@@ -1945,16 +1955,18 @@ static int add_special_section_alts(struct objtool_file *file)
continue;
}
- ret = handle_group_alt(file, special_alt, orig_insn,
- &new_insn);
- if (ret)
- return ret;
+ if (handle_group_alt(file, special_alt, orig_insn, &new_insn))
+ return -1;
+
+ alt_type = ALT_TYPE_INSTRUCTIONS;
} else if (special_alt->jump_or_nop) {
- ret = handle_jump_alt(file, special_alt, orig_insn,
- &new_insn);
- if (ret)
- return ret;
+ if (handle_jump_alt(file, special_alt, orig_insn, &new_insn))
+ return -1;
+
+ alt_type = ALT_TYPE_JUMP_TABLE;
+ } else {
+ alt_type = ALT_TYPE_EX_TABLE;
}
alt = calloc(1, sizeof(*alt));
@@ -1964,8 +1976,20 @@ static int add_special_section_alts(struct objtool_file *file)
}
alt->insn = new_insn;
- alt->next = orig_insn->alts;
- orig_insn->alts = alt;
+ alt->type = alt_type;
+ alt->next = NULL;
+
+ /*
+ * Store alternatives in the same order they have been
+ * defined.
+ */
+ if (!orig_insn->alts) {
+ orig_insn->alts = alt;
+ } else {
+ for (a = orig_insn->alts; a->next; a = a->next)
+ ;
+ a->next = alt;
+ }
list_del(&special_alt->list);
free(special_alt);
@@ -2142,15 +2166,13 @@ static int add_func_jump_tables(struct objtool_file *file,
struct symbol *func)
{
struct instruction *insn;
- int ret;
func_for_each_insn(file, func, insn) {
if (!insn_jump_table(insn))
continue;
- ret = add_jump_table(file, insn);
- if (ret)
- return ret;
+ if (add_jump_table(file, insn))
+ return -1;
}
return 0;
@@ -2164,19 +2186,17 @@ static int add_func_jump_tables(struct objtool_file *file,
static int add_jump_table_alts(struct objtool_file *file)
{
struct symbol *func;
- int ret;
if (!file->rodata)
return 0;
- for_each_sym(file, func) {
- if (func->type != STT_FUNC)
+ for_each_sym(file->elf, func) {
+ if (!is_func_sym(func) || func->alias != func)
continue;
mark_func_jump_tables(file, func);
- ret = add_func_jump_tables(file, func);
- if (ret)
- return ret;
+ if (add_func_jump_tables(file, func))
+ return -1;
}
return 0;
@@ -2210,14 +2230,14 @@ static int read_unwind_hints(struct objtool_file *file)
return -1;
}
- if (sec->sh.sh_size % sizeof(struct unwind_hint)) {
+ if (sec_size(sec) % sizeof(struct unwind_hint)) {
ERROR("struct unwind_hint size mismatch");
return -1;
}
file->hints = true;
- for (i = 0; i < sec->sh.sh_size / sizeof(struct unwind_hint); i++) {
+ for (i = 0; i < sec_size(sec) / sizeof(struct unwind_hint); i++) {
hint = (struct unwind_hint *)sec->data->d_buf + i;
reloc = find_reloc_by_dest(file->elf, sec, i * sizeof(*hint));
@@ -2226,14 +2246,7 @@ static int read_unwind_hints(struct objtool_file *file)
return -1;
}
- if (reloc->sym->type == STT_SECTION) {
- offset = reloc_addend(reloc);
- } else if (reloc->sym->local_label) {
- offset = reloc->sym->offset;
- } else {
- ERROR("unexpected relocation symbol type in %s", sec->rsec->name);
- return -1;
- }
+ offset = reloc->sym->offset + reloc_addend(reloc);
insn = find_insn(file, reloc->sym->sec, offset);
if (!insn) {
@@ -2262,7 +2275,7 @@ static int read_unwind_hints(struct objtool_file *file)
if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) {
struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset);
- if (sym && sym->bind == STB_GLOBAL) {
+ if (sym && is_global_sym(sym)) {
if (opts.ibt && insn->type != INSN_ENDBR && !insn->noendbr) {
ERROR_INSN(insn, "UNWIND_HINT_IRET_REGS without ENDBR");
return -1;
@@ -2300,7 +2313,7 @@ static int read_annotate(struct objtool_file *file,
struct instruction *insn;
struct reloc *reloc;
uint64_t offset;
- int type, ret;
+ int type;
sec = find_section_by_name(file->elf, ".discard.annotate_insn");
if (!sec)
@@ -2318,10 +2331,13 @@ static int read_annotate(struct objtool_file *file,
sec->sh.sh_entsize = 8;
}
- for_each_reloc(sec->rsec, reloc) {
- type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * sec->sh.sh_entsize) + 4);
- type = bswap_if_needed(file->elf, type);
+ if (sec_num_entries(sec) != sec_num_entries(sec->rsec)) {
+ ERROR("bad .discard.annotate_insn section: missing relocs");
+ return -1;
+ }
+ for_each_reloc(sec->rsec, reloc) {
+ type = annotype(file->elf, sec, reloc);
offset = reloc->sym->offset + reloc_addend(reloc);
insn = find_insn(file, reloc->sym->sec, offset);
@@ -2330,9 +2346,8 @@ static int read_annotate(struct objtool_file *file,
return -1;
}
- ret = func(file, type, insn);
- if (ret < 0)
- return ret;
+ if (func(file, type, insn))
+ return -1;
}
return 0;
@@ -2471,12 +2486,13 @@ static bool is_profiling_func(const char *name)
static int classify_symbols(struct objtool_file *file)
{
struct symbol *func;
+ size_t len;
- for_each_sym(file, func) {
- if (func->type == STT_NOTYPE && strstarts(func->name, ".L"))
+ for_each_sym(file->elf, func) {
+ if (is_notype_sym(func) && strstarts(func->name, ".L"))
func->local_label = true;
- if (func->bind != STB_GLOBAL)
+ if (!is_global_sym(func))
continue;
if (!strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR,
@@ -2497,6 +2513,10 @@ static int classify_symbols(struct objtool_file *file)
if (is_profiling_func(func->name))
func->profiling_func = true;
+
+ len = strlen(func->name);
+ if (len > sym_name_max_len)
+ sym_name_max_len = len;
}
return 0;
@@ -2517,7 +2537,7 @@ static void mark_rodata(struct objtool_file *file)
*
* .rodata.str1.* sections are ignored; they don't contain jump tables.
*/
- for_each_sec(file, sec) {
+ for_each_sec(file->elf, sec) {
if ((!strncmp(sec->name, ".rodata", 7) &&
!strstr(sec->name, ".str1.")) ||
!strncmp(sec->name, ".data.rel.ro", 12)) {
@@ -2529,78 +2549,115 @@ static void mark_rodata(struct objtool_file *file)
file->rodata = found;
}
+static void mark_holes(struct objtool_file *file)
+{
+ struct instruction *insn;
+ bool in_hole = false;
+
+ if (!opts.link)
+ return;
+
+ /*
+ * Whole archive runs might encounter dead code from weak symbols.
+ * This is where the linker will have dropped the weak symbol in
+ * favour of a regular symbol, but leaves the code in place.
+ */
+ for_each_insn(file, insn) {
+ if (insn->sym || !find_symbol_hole_containing(insn->sec, insn->offset)) {
+ in_hole = false;
+ continue;
+ }
+
+ /* Skip function padding and pfx code */
+ if (!in_hole && insn->type == INSN_NOP)
+ continue;
+
+ in_hole = true;
+ insn->hole = 1;
+
+ /*
+ * If this hole jumps to a .cold function, mark it ignore.
+ */
+ if (insn->jump_dest) {
+ struct symbol *dest_func = insn_func(insn->jump_dest);
+
+ if (dest_func && dest_func->cold)
+ dest_func->ignore = true;
+ }
+ }
+}
+
+static bool validate_branch_enabled(void)
+{
+ return opts.stackval ||
+ opts.orc ||
+ opts.uaccess ||
+ opts.checksum;
+}
+
static int decode_sections(struct objtool_file *file)
{
- int ret;
+ file->klp = is_livepatch_module(file);
mark_rodata(file);
- ret = init_pv_ops(file);
- if (ret)
- return ret;
+ if (init_pv_ops(file))
+ return -1;
/*
* Must be before add_{jump_call}_destination.
*/
- ret = classify_symbols(file);
- if (ret)
- return ret;
+ if (classify_symbols(file))
+ return -1;
- ret = decode_instructions(file);
- if (ret)
- return ret;
+ if (decode_instructions(file))
+ return -1;
- ret = add_ignores(file);
- if (ret)
- return ret;
+ if (add_ignores(file))
+ return -1;
add_uaccess_safe(file);
- ret = read_annotate(file, __annotate_early);
- if (ret)
- return ret;
+ if (read_annotate(file, __annotate_early))
+ return -1;
/*
* Must be before add_jump_destinations(), which depends on 'func'
* being set for alternatives, to enable proper sibling call detection.
*/
- if (opts.stackval || opts.orc || opts.uaccess || opts.noinstr) {
- ret = add_special_section_alts(file);
- if (ret)
- return ret;
+ if (validate_branch_enabled() || opts.noinstr || opts.hack_jump_label || opts.disas) {
+ if (add_special_section_alts(file))
+ return -1;
}
- ret = add_jump_destinations(file);
- if (ret)
- return ret;
+ if (add_jump_destinations(file))
+ return -1;
/*
* Must be before add_call_destination(); it changes INSN_CALL to
* INSN_JUMP.
*/
- ret = read_annotate(file, __annotate_ifc);
- if (ret)
- return ret;
+ if (read_annotate(file, __annotate_ifc))
+ return -1;
- ret = add_call_destinations(file);
- if (ret)
- return ret;
+ if (add_call_destinations(file))
+ return -1;
- ret = add_jump_table_alts(file);
- if (ret)
- return ret;
+ if (add_jump_table_alts(file))
+ return -1;
- ret = read_unwind_hints(file);
- if (ret)
- return ret;
+ if (read_unwind_hints(file))
+ return -1;
+
+ /* Must be after add_jump_destinations() */
+ mark_holes(file);
/*
* Must be after add_call_destinations() such that it can override
* dead_end_function() marks.
*/
- ret = read_annotate(file, __annotate_late);
- if (ret)
- return ret;
+ if (read_annotate(file, __annotate_late))
+ return -1;
return 0;
}
@@ -3354,7 +3411,7 @@ static bool pv_call_dest(struct objtool_file *file, struct instruction *insn)
if (!reloc || strcmp(reloc->sym->name, "pv_ops"))
return false;
- idx = (arch_dest_reloc_offset(reloc_addend(reloc)) / sizeof(void *));
+ idx = arch_insn_adjusted_addend(insn, reloc) / sizeof(void *);
if (file->pv_ops[idx].clean)
return true;
@@ -3516,9 +3573,14 @@ static bool skip_alt_group(struct instruction *insn)
{
struct instruction *alt_insn = insn->alts ? insn->alts->insn : NULL;
+ if (!insn->alt_group)
+ return false;
+
/* ANNOTATE_IGNORE_ALTERNATIVE */
- if (insn->alt_group && insn->alt_group->ignore)
+ if (insn->alt_group->ignore) {
+ TRACE_ALT(insn, "alt group ignored");
return true;
+ }
/*
* For NOP patched with CLAC/STAC, only follow the latter to avoid
@@ -3540,258 +3602,404 @@ static bool skip_alt_group(struct instruction *insn)
return alt_insn->type == INSN_CLAC || alt_insn->type == INSN_STAC;
}
-/*
- * Follow the branch starting at the given instruction, and recursively follow
- * any other branches (jumps). Meanwhile, track the frame pointer state at
- * each instruction and validate all the rules described in
- * tools/objtool/Documentation/objtool.txt.
- */
-static int validate_branch(struct objtool_file *file, struct symbol *func,
- struct instruction *insn, struct insn_state state)
+static int checksum_debug_init(struct objtool_file *file)
{
- struct alternative *alt;
- struct instruction *next_insn, *prev_insn = NULL;
- struct section *sec;
- u8 visited;
- int ret;
+ char *dup, *s;
- if (func && func->ignore)
+ if (!opts.debug_checksum)
return 0;
- sec = insn->sec;
+ dup = strdup(opts.debug_checksum);
+ if (!dup) {
+ ERROR_GLIBC("strdup");
+ return -1;
+ }
- while (1) {
- next_insn = next_insn_to_validate(file, insn);
+ s = dup;
+ while (*s) {
+ struct symbol *func;
+ char *comma;
- if (func && insn_func(insn) && func != insn_func(insn)->pfunc) {
- /* Ignore KCFI type preambles, which always fall through */
- if (!strncmp(func->name, "__cfi_", 6) ||
- !strncmp(func->name, "__pfx_", 6) ||
- !strncmp(func->name, "__pi___cfi_", 11) ||
- !strncmp(func->name, "__pi___pfx_", 11))
- return 0;
+ comma = strchr(s, ',');
+ if (comma)
+ *comma = '\0';
- if (file->ignore_unreachables)
- return 0;
+ func = find_symbol_by_name(file->elf, s);
+ if (!func || !is_func_sym(func))
+ WARN("--debug-checksum: can't find '%s'", s);
+ else
+ func->debug_checksum = 1;
- WARN("%s() falls through to next function %s()",
- func->name, insn_func(insn)->name);
- func->warned = 1;
+ if (!comma)
+ break;
- return 1;
- }
+ s = comma + 1;
+ }
- visited = VISITED_BRANCH << state.uaccess;
- if (insn->visited & VISITED_BRANCH_MASK) {
- if (!insn->hint && !insn_cfi_match(insn, &state.cfi))
- return 1;
+ free(dup);
+ return 0;
+}
- if (insn->visited & visited)
- return 0;
- } else {
- nr_insns_visited++;
- }
+static void checksum_update_insn(struct objtool_file *file, struct symbol *func,
+ struct instruction *insn)
+{
+ struct reloc *reloc = insn_reloc(file, insn);
+ unsigned long offset;
+ struct symbol *sym;
- if (state.noinstr)
- state.instr += insn->instr;
+ if (insn->fake)
+ return;
- if (insn->hint) {
- if (insn->restore) {
- struct instruction *save_insn, *i;
+ checksum_update(func, insn, insn->sec->data->d_buf + insn->offset, insn->len);
- i = insn;
- save_insn = NULL;
+ if (!reloc) {
+ struct symbol *call_dest = insn_call_dest(insn);
- sym_for_each_insn_continue_reverse(file, func, i) {
- if (i->save) {
- save_insn = i;
- break;
- }
- }
+ if (call_dest)
+ checksum_update(func, insn, call_dest->demangled_name,
+ strlen(call_dest->demangled_name));
+ return;
+ }
- if (!save_insn) {
- WARN_INSN(insn, "no corresponding CFI save for CFI restore");
- return 1;
+ sym = reloc->sym;
+ offset = arch_insn_adjusted_addend(insn, reloc);
+
+ if (is_string_sec(sym->sec)) {
+ char *str;
+
+ str = sym->sec->data->d_buf + sym->offset + offset;
+ checksum_update(func, insn, str, strlen(str));
+ return;
+ }
+
+ if (is_sec_sym(sym)) {
+ sym = find_symbol_containing(reloc->sym->sec, offset);
+ if (!sym)
+ return;
+
+ offset -= sym->offset;
+ }
+
+ checksum_update(func, insn, sym->demangled_name, strlen(sym->demangled_name));
+ checksum_update(func, insn, &offset, sizeof(offset));
+}
+
+static int validate_branch(struct objtool_file *file, struct symbol *func,
+ struct instruction *insn, struct insn_state state);
+static int do_validate_branch(struct objtool_file *file, struct symbol *func,
+ struct instruction *insn, struct insn_state state);
+
+static int validate_insn(struct objtool_file *file, struct symbol *func,
+ struct instruction *insn, struct insn_state *statep,
+ struct instruction *prev_insn, struct instruction *next_insn,
+ bool *dead_end)
+{
+ /* prev_state and alt_name are not used if there is no disassembly support */
+ struct insn_state prev_state __maybe_unused;
+ char *alt_name __maybe_unused = NULL;
+ struct alternative *alt;
+ u8 visited;
+ int ret;
+
+ /*
+ * Any returns before the end of this function are effectively dead
+ * ends, i.e. validate_branch() has reached the end of the branch.
+ */
+ *dead_end = true;
+
+ visited = VISITED_BRANCH << statep->uaccess;
+ if (insn->visited & VISITED_BRANCH_MASK) {
+ if (!insn->hint && !insn_cfi_match(insn, &statep->cfi))
+ return 1;
+
+ if (insn->visited & visited) {
+ TRACE_INSN(insn, "already visited");
+ return 0;
+ }
+ } else {
+ nr_insns_visited++;
+ }
+
+ if (statep->noinstr)
+ statep->instr += insn->instr;
+
+ if (insn->hint) {
+ if (insn->restore) {
+ struct instruction *save_insn, *i;
+
+ i = insn;
+ save_insn = NULL;
+
+ sym_for_each_insn_continue_reverse(file, func, i) {
+ if (i->save) {
+ save_insn = i;
+ break;
}
+ }
- if (!save_insn->visited) {
- /*
- * If the restore hint insn is at the
- * beginning of a basic block and was
- * branched to from elsewhere, and the
- * save insn hasn't been visited yet,
- * defer following this branch for now.
- * It will be seen later via the
- * straight-line path.
- */
- if (!prev_insn)
- return 0;
+ if (!save_insn) {
+ WARN_INSN(insn, "no corresponding CFI save for CFI restore");
+ return 1;
+ }
- WARN_INSN(insn, "objtool isn't smart enough to handle this CFI save/restore combo");
- return 1;
+ if (!save_insn->visited) {
+ /*
+ * If the restore hint insn is at the
+ * beginning of a basic block and was
+ * branched to from elsewhere, and the
+ * save insn hasn't been visited yet,
+ * defer following this branch for now.
+ * It will be seen later via the
+ * straight-line path.
+ */
+ if (!prev_insn) {
+ TRACE_INSN(insn, "defer restore");
+ return 0;
}
- insn->cfi = save_insn->cfi;
- nr_cfi_reused++;
+ WARN_INSN(insn, "objtool isn't smart enough to handle this CFI save/restore combo");
+ return 1;
}
- state.cfi = *insn->cfi;
+ insn->cfi = save_insn->cfi;
+ nr_cfi_reused++;
+ }
+
+ statep->cfi = *insn->cfi;
+ } else {
+ /* XXX track if we actually changed statep->cfi */
+
+ if (prev_insn && !cficmp(prev_insn->cfi, &statep->cfi)) {
+ insn->cfi = prev_insn->cfi;
+ nr_cfi_reused++;
} else {
- /* XXX track if we actually changed state.cfi */
+ insn->cfi = cfi_hash_find_or_add(&statep->cfi);
+ }
+ }
- if (prev_insn && !cficmp(prev_insn->cfi, &state.cfi)) {
- insn->cfi = prev_insn->cfi;
- nr_cfi_reused++;
- } else {
- insn->cfi = cfi_hash_find_or_add(&state.cfi);
+ insn->visited |= visited;
+
+ if (propagate_alt_cfi(file, insn))
+ return 1;
+
+ if (insn->alts) {
+ for (alt = insn->alts; alt; alt = alt->next) {
+ TRACE_ALT_BEGIN(insn, alt, alt_name);
+ ret = validate_branch(file, func, alt->insn, *statep);
+ TRACE_ALT_END(insn, alt, alt_name);
+ if (ret) {
+ BT_INSN(insn, "(alt)");
+ return ret;
}
}
+ TRACE_ALT_INFO_NOADDR(insn, "/ ", "DEFAULT");
+ }
+
+ if (skip_alt_group(insn))
+ return 0;
+
+ prev_state = *statep;
+ ret = handle_insn_ops(insn, next_insn, statep);
+ TRACE_INSN_STATE(insn, &prev_state, statep);
+
+ if (ret)
+ return 1;
+
+ switch (insn->type) {
+
+ case INSN_RETURN:
+ TRACE_INSN(insn, "return");
+ return validate_return(func, insn, statep);
+
+ case INSN_CALL:
+ case INSN_CALL_DYNAMIC:
+ if (insn->type == INSN_CALL)
+ TRACE_INSN(insn, "call");
+ else
+ TRACE_INSN(insn, "indirect call");
- insn->visited |= visited;
+ ret = validate_call(file, insn, statep);
+ if (ret)
+ return ret;
- if (propagate_alt_cfi(file, insn))
+ if (opts.stackval && func && !is_special_call(insn) &&
+ !has_valid_stack_frame(statep)) {
+ WARN_INSN(insn, "call without frame pointer save/setup");
return 1;
+ }
- if (insn->alts) {
- for (alt = insn->alts; alt; alt = alt->next) {
- ret = validate_branch(file, func, alt->insn, state);
- if (ret) {
- BT_INSN(insn, "(alt)");
- return ret;
- }
+ break;
+
+ case INSN_JUMP_CONDITIONAL:
+ case INSN_JUMP_UNCONDITIONAL:
+ if (is_sibling_call(insn)) {
+ TRACE_INSN(insn, "sibling call");
+ ret = validate_sibling_call(file, insn, statep);
+ if (ret)
+ return ret;
+
+ } else if (insn->jump_dest) {
+ if (insn->type == INSN_JUMP_UNCONDITIONAL)
+ TRACE_INSN(insn, "unconditional jump");
+ else
+ TRACE_INSN(insn, "jump taken");
+
+ ret = validate_branch(file, func, insn->jump_dest, *statep);
+ if (ret) {
+ BT_INSN(insn, "(branch)");
+ return ret;
}
}
- if (skip_alt_group(insn))
+ if (insn->type == INSN_JUMP_UNCONDITIONAL)
return 0;
- if (handle_insn_ops(insn, next_insn, &state))
- return 1;
-
- switch (insn->type) {
-
- case INSN_RETURN:
- return validate_return(func, insn, &state);
+ TRACE_INSN(insn, "jump not taken");
+ break;
- case INSN_CALL:
- case INSN_CALL_DYNAMIC:
- ret = validate_call(file, insn, &state);
+ case INSN_JUMP_DYNAMIC:
+ case INSN_JUMP_DYNAMIC_CONDITIONAL:
+ TRACE_INSN(insn, "indirect jump");
+ if (is_sibling_call(insn)) {
+ ret = validate_sibling_call(file, insn, statep);
if (ret)
return ret;
+ }
- if (opts.stackval && func && !is_special_call(insn) &&
- !has_valid_stack_frame(&state)) {
- WARN_INSN(insn, "call without frame pointer save/setup");
- return 1;
- }
+ if (insn->type == INSN_JUMP_DYNAMIC)
+ return 0;
- break;
+ break;
- case INSN_JUMP_CONDITIONAL:
- case INSN_JUMP_UNCONDITIONAL:
- if (is_sibling_call(insn)) {
- ret = validate_sibling_call(file, insn, &state);
- if (ret)
- return ret;
+ case INSN_SYSCALL:
+ TRACE_INSN(insn, "syscall");
+ if (func && (!next_insn || !next_insn->hint)) {
+ WARN_INSN(insn, "unsupported instruction in callable function");
+ return 1;
+ }
- } else if (insn->jump_dest) {
- ret = validate_branch(file, func,
- insn->jump_dest, state);
- if (ret) {
- BT_INSN(insn, "(branch)");
- return ret;
- }
- }
+ break;
- if (insn->type == INSN_JUMP_UNCONDITIONAL)
- return 0;
+ case INSN_SYSRET:
+ TRACE_INSN(insn, "sysret");
+ if (func && (!next_insn || !next_insn->hint)) {
+ WARN_INSN(insn, "unsupported instruction in callable function");
+ return 1;
+ }
+ return 0;
+
+ case INSN_STAC:
+ TRACE_INSN(insn, "stac");
+ if (!opts.uaccess)
break;
- case INSN_JUMP_DYNAMIC:
- case INSN_JUMP_DYNAMIC_CONDITIONAL:
- if (is_sibling_call(insn)) {
- ret = validate_sibling_call(file, insn, &state);
- if (ret)
- return ret;
- }
+ if (statep->uaccess) {
+ WARN_INSN(insn, "recursive UACCESS enable");
+ return 1;
+ }
- if (insn->type == INSN_JUMP_DYNAMIC)
- return 0;
+ statep->uaccess = true;
+ break;
+ case INSN_CLAC:
+ TRACE_INSN(insn, "clac");
+ if (!opts.uaccess)
break;
- case INSN_SYSCALL:
- if (func && (!next_insn || !next_insn->hint)) {
- WARN_INSN(insn, "unsupported instruction in callable function");
- return 1;
- }
+ if (!statep->uaccess && func) {
+ WARN_INSN(insn, "redundant UACCESS disable");
+ return 1;
+ }
- break;
+ if (func_uaccess_safe(func) && !statep->uaccess_stack) {
+ WARN_INSN(insn, "UACCESS-safe disables UACCESS");
+ return 1;
+ }
- case INSN_SYSRET:
- if (func && (!next_insn || !next_insn->hint)) {
- WARN_INSN(insn, "unsupported instruction in callable function");
- return 1;
- }
+ statep->uaccess = false;
+ break;
- return 0;
+ case INSN_STD:
+ TRACE_INSN(insn, "std");
+ if (statep->df) {
+ WARN_INSN(insn, "recursive STD");
+ return 1;
+ }
- case INSN_STAC:
- if (!opts.uaccess)
- break;
+ statep->df = true;
+ break;
- if (state.uaccess) {
- WARN_INSN(insn, "recursive UACCESS enable");
- return 1;
- }
+ case INSN_CLD:
+ TRACE_INSN(insn, "cld");
+ if (!statep->df && func) {
+ WARN_INSN(insn, "redundant CLD");
+ return 1;
+ }
- state.uaccess = true;
- break;
+ statep->df = false;
+ break;
- case INSN_CLAC:
- if (!opts.uaccess)
- break;
+ default:
+ break;
+ }
- if (!state.uaccess && func) {
- WARN_INSN(insn, "redundant UACCESS disable");
- return 1;
- }
+ if (insn->dead_end)
+ TRACE_INSN(insn, "dead end");
- if (func_uaccess_safe(func) && !state.uaccess_stack) {
- WARN_INSN(insn, "UACCESS-safe disables UACCESS");
- return 1;
- }
+ *dead_end = insn->dead_end;
+ return 0;
+}
- state.uaccess = false;
- break;
+/*
+ * Follow the branch starting at the given instruction, and recursively follow
+ * any other branches (jumps). Meanwhile, track the frame pointer state at
+ * each instruction and validate all the rules described in
+ * tools/objtool/Documentation/objtool.txt.
+ */
+static int do_validate_branch(struct objtool_file *file, struct symbol *func,
+ struct instruction *insn, struct insn_state state)
+{
+ struct instruction *next_insn, *prev_insn = NULL;
+ bool dead_end;
+ int ret;
- case INSN_STD:
- if (state.df) {
- WARN_INSN(insn, "recursive STD");
- return 1;
- }
+ if (func && func->ignore)
+ return 0;
- state.df = true;
- break;
+ do {
+ insn->trace = 0;
+ next_insn = next_insn_to_validate(file, insn);
- case INSN_CLD:
- if (!state.df && func) {
- WARN_INSN(insn, "redundant CLD");
- return 1;
- }
+ if (opts.checksum && func && insn->sec)
+ checksum_update_insn(file, func, insn);
- state.df = false;
- break;
+ if (func && insn_func(insn) && func != insn_func(insn)->pfunc) {
+ /* Ignore KCFI type preambles, which always fall through */
+ if (is_prefix_func(func))
+ return 0;
- default:
- break;
+ if (file->ignore_unreachables)
+ return 0;
+
+ WARN("%s() falls through to next function %s()",
+ func->name, insn_func(insn)->name);
+ func->warned = 1;
+
+ return 1;
}
- if (insn->dead_end)
- return 0;
+ ret = validate_insn(file, func, insn, &state, prev_insn, next_insn,
+ &dead_end);
+
+ if (!insn->trace) {
+ if (ret)
+ TRACE_INSN(insn, "warning (%d)", ret);
+ else
+ TRACE_INSN(insn, NULL);
+ }
- if (!next_insn) {
+ if (!dead_end && !next_insn) {
if (state.cfi.cfa.base == CFI_UNDEFINED)
return 0;
if (file->ignore_unreachables)
@@ -3799,15 +4007,28 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
WARN("%s%sunexpected end of section %s",
func ? func->name : "", func ? "(): " : "",
- sec->name);
+ insn->sec->name);
return 1;
}
prev_insn = insn;
insn = next_insn;
- }
- return 0;
+ } while (!dead_end);
+
+ return ret;
+}
+
+static int validate_branch(struct objtool_file *file, struct symbol *func,
+ struct instruction *insn, struct insn_state state)
+{
+ int ret;
+
+ trace_depth_inc();
+ ret = do_validate_branch(file, func, insn, state);
+ trace_depth_dec();
+
+ return ret;
}
static int validate_unwind_hint(struct objtool_file *file,
@@ -3815,7 +4036,13 @@ static int validate_unwind_hint(struct objtool_file *file,
struct insn_state *state)
{
if (insn->hint && !insn->visited) {
- int ret = validate_branch(file, insn_func(insn), insn, *state);
+ struct symbol *func = insn_func(insn);
+ int ret;
+
+ if (opts.checksum)
+ checksum_init(func);
+
+ ret = validate_branch(file, func, insn, *state);
if (ret)
BT_INSN(insn, "<=== (hint)");
return ret;
@@ -4059,7 +4286,8 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio
struct instruction *prev_insn;
int i;
- if (insn->type == INSN_NOP || insn->type == INSN_TRAP || (func && func->ignore))
+ if (insn->type == INSN_NOP || insn->type == INSN_TRAP ||
+ insn->hole || (func && func->ignore))
return true;
/*
@@ -4070,47 +4298,6 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio
!strcmp(insn->sec->name, ".altinstr_aux"))
return true;
- /*
- * Whole archive runs might encounter dead code from weak symbols.
- * This is where the linker will have dropped the weak symbol in
- * favour of a regular symbol, but leaves the code in place.
- *
- * In this case we'll find a piece of code (whole function) that is not
- * covered by a !section symbol. Ignore them.
- */
- if (opts.link && !func) {
- int size = find_symbol_hole_containing(insn->sec, insn->offset);
- unsigned long end = insn->offset + size;
-
- if (!size) /* not a hole */
- return false;
-
- if (size < 0) /* hole until the end */
- return true;
-
- sec_for_each_insn_continue(file, insn) {
- /*
- * If we reach a visited instruction at or before the
- * end of the hole, ignore the unreachable.
- */
- if (insn->visited)
- return true;
-
- if (insn->offset >= end)
- break;
-
- /*
- * If this hole jumps to a .cold function, mark it ignore too.
- */
- if (insn->jump_dest && insn_func(insn->jump_dest) &&
- strstr(insn_func(insn->jump_dest)->name, ".cold")) {
- insn_func(insn->jump_dest)->ignore = true;
- }
- }
-
- return false;
- }
-
if (!func)
return false;
@@ -4162,14 +4349,54 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio
return false;
}
-static int add_prefix_symbol(struct objtool_file *file, struct symbol *func)
+/*
+ * For FineIBT or kCFI, a certain number of bytes preceding the function may be
+ * NOPs. Those NOPs may be rewritten at runtime and executed, so give them a
+ * proper function name: __pfx_<func>.
+ *
+ * The NOPs may not exist for the following cases:
+ *
+ * - compiler cloned functions (*.cold, *.part0, etc)
+ * - asm functions created with inline asm or without SYM_FUNC_START()
+ *
+ * Also, the function may already have a prefix from a previous objtool run
+ * (livepatch extracted functions, or manually running objtool multiple times).
+ *
+ * So return 0 if the NOPs are missing or the function already has a prefix
+ * symbol.
+ */
+static int create_prefix_symbol(struct objtool_file *file, struct symbol *func)
{
struct instruction *insn, *prev;
+ char name[SYM_NAME_LEN];
struct cfi_state *cfi;
+ if (!is_func_sym(func) || is_prefix_func(func) ||
+ func->cold || func->static_call_tramp)
+ return 0;
+
+ if ((strlen(func->name) + sizeof("__pfx_") > SYM_NAME_LEN)) {
+ WARN("%s: symbol name too long, can't create __pfx_ symbol",
+ func->name);
+ return 0;
+ }
+
+ if (snprintf_check(name, SYM_NAME_LEN, "__pfx_%s", func->name))
+ return -1;
+
+ if (file->klp) {
+ struct symbol *pfx;
+
+ pfx = find_symbol_by_offset(func->sec, func->offset - opts.prefix);
+ if (pfx && is_prefix_func(pfx) && !strcmp(pfx->name, name))
+ return 0;
+ }
+
insn = find_insn(file, func->sec, func->offset);
- if (!insn)
+ if (!insn) {
+ WARN("%s: can't find starting instruction", func->name);
return -1;
+ }
for (prev = prev_insn_same_sec(file, insn);
prev;
@@ -4177,22 +4404,27 @@ static int add_prefix_symbol(struct objtool_file *file, struct symbol *func)
u64 offset;
if (prev->type != INSN_NOP)
- return -1;
+ return 0;
offset = func->offset - prev->offset;
if (offset > opts.prefix)
- return -1;
+ return 0;
if (offset < opts.prefix)
continue;
- elf_create_prefix_symbol(file->elf, func, opts.prefix);
+ if (!elf_create_symbol(file->elf, name, func->sec,
+ GELF_ST_BIND(func->sym.st_info),
+ GELF_ST_TYPE(func->sym.st_info),
+ prev->offset, opts.prefix))
+ return -1;
+
break;
}
if (!prev)
- return -1;
+ return 0;
if (!insn->cfi) {
/*
@@ -4210,20 +4442,18 @@ static int add_prefix_symbol(struct objtool_file *file, struct symbol *func)
return 0;
}
-static int add_prefix_symbols(struct objtool_file *file)
+static int create_prefix_symbols(struct objtool_file *file)
{
struct section *sec;
struct symbol *func;
- for_each_sec(file, sec) {
- if (!(sec->sh.sh_flags & SHF_EXECINSTR))
+ for_each_sec(file->elf, sec) {
+ if (!is_text_sec(sec))
continue;
sec_for_each_sym(sec, func) {
- if (func->type != STT_FUNC)
- continue;
-
- add_prefix_symbol(file, func);
+ if (create_prefix_symbol(file, func))
+ return -1;
}
}
@@ -4234,6 +4464,7 @@ static int validate_symbol(struct objtool_file *file, struct section *sec,
struct symbol *sym, struct insn_state *state)
{
struct instruction *insn;
+ struct symbol *func;
int ret;
if (!sym->len) {
@@ -4251,9 +4482,26 @@ static int validate_symbol(struct objtool_file *file, struct section *sec,
if (opts.uaccess)
state->uaccess = sym->uaccess_safe;
- ret = validate_branch(file, insn_func(insn), insn, *state);
+ func = insn_func(insn);
+
+ if (opts.checksum)
+ checksum_init(func);
+
+ if (opts.trace && !fnmatch(opts.trace, sym->name, 0)) {
+ trace_enable();
+ TRACE("%s: validation begin\n", sym->name);
+ }
+
+ ret = validate_branch(file, func, insn, *state);
if (ret)
BT_INSN(insn, "<=== (sym)");
+
+ TRACE("%s: validation %s\n\n", sym->name, ret ? "failed" : "end");
+ trace_disable();
+
+ if (opts.checksum)
+ checksum_finish(func);
+
return ret;
}
@@ -4264,7 +4512,7 @@ static int validate_section(struct objtool_file *file, struct section *sec)
int warnings = 0;
sec_for_each_sym(sec, func) {
- if (func->type != STT_FUNC)
+ if (!is_func_sym(func))
continue;
init_insn_state(file, &state, sec);
@@ -4307,8 +4555,8 @@ static int validate_functions(struct objtool_file *file)
struct section *sec;
int warnings = 0;
- for_each_sec(file, sec) {
- if (!(sec->sh.sh_flags & SHF_EXECINSTR))
+ for_each_sec(file->elf, sec) {
+ if (!is_text_sec(sec))
continue;
warnings += validate_section(file, sec);
@@ -4435,12 +4683,7 @@ static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn
reloc_offset(reloc) + 1,
(insn->offset + insn->len) - (reloc_offset(reloc) + 1))) {
- off = reloc->sym->offset;
- if (reloc_type(reloc) == R_X86_64_PC32 ||
- reloc_type(reloc) == R_X86_64_PLT32)
- off += arch_dest_reloc_offset(reloc_addend(reloc));
- else
- off += reloc_addend(reloc);
+ off = reloc->sym->offset + arch_insn_adjusted_addend(insn, reloc);
dest = find_insn(file, reloc->sym->sec, off);
if (!dest)
@@ -4491,10 +4734,10 @@ static int validate_ibt(struct objtool_file *file)
for_each_insn(file, insn)
warnings += validate_ibt_insn(file, insn);
- for_each_sec(file, sec) {
+ for_each_sec(file->elf, sec) {
/* Already done by validate_ibt_insn() */
- if (sec->sh.sh_flags & SHF_EXECINSTR)
+ if (is_text_sec(sec))
continue;
if (!sec->rsec)
@@ -4509,8 +4752,8 @@ static int validate_ibt(struct objtool_file *file)
!strncmp(sec->name, ".debug", 6) ||
!strcmp(sec->name, ".altinstructions") ||
!strcmp(sec->name, ".ibt_endbr_seal") ||
+ !strcmp(sec->name, ".kcfi_traps") ||
!strcmp(sec->name, ".orc_unwind_ip") ||
- !strcmp(sec->name, ".parainstructions") ||
!strcmp(sec->name, ".retpoline_sites") ||
!strcmp(sec->name, ".smp_locks") ||
!strcmp(sec->name, ".static_call_sites") ||
@@ -4519,12 +4762,14 @@ static int validate_ibt(struct objtool_file *file)
!strcmp(sec->name, "__bug_table") ||
!strcmp(sec->name, "__ex_table") ||
!strcmp(sec->name, "__jump_table") ||
+ !strcmp(sec->name, "__klp_funcs") ||
!strcmp(sec->name, "__mcount_loc") ||
- !strcmp(sec->name, ".kcfi_traps") ||
!strcmp(sec->name, ".llvm.call-graph-profile") ||
!strcmp(sec->name, ".llvm_bb_addr_map") ||
!strcmp(sec->name, "__tracepoints") ||
- strstr(sec->name, "__patchable_function_entries"))
+ !strcmp(sec->name, ".return_sites") ||
+ !strcmp(sec->name, ".call_sites") ||
+ !strcmp(sec->name, "__patchable_function_entries"))
continue;
for_each_reloc(sec->rsec, reloc)
@@ -4598,87 +4843,6 @@ static int validate_reachable_instructions(struct objtool_file *file)
return warnings;
}
-/* 'funcs' is a space-separated list of function names */
-static void disas_funcs(const char *funcs)
-{
- const char *objdump_str, *cross_compile;
- int size, ret;
- char *cmd;
-
- cross_compile = getenv("CROSS_COMPILE");
- if (!cross_compile)
- cross_compile = "";
-
- objdump_str = "%sobjdump -wdr %s | gawk -M -v _funcs='%s' '"
- "BEGIN { split(_funcs, funcs); }"
- "/^$/ { func_match = 0; }"
- "/<.*>:/ { "
- "f = gensub(/.*<(.*)>:/, \"\\\\1\", 1);"
- "for (i in funcs) {"
- "if (funcs[i] == f) {"
- "func_match = 1;"
- "base = strtonum(\"0x\" $1);"
- "break;"
- "}"
- "}"
- "}"
- "{"
- "if (func_match) {"
- "addr = strtonum(\"0x\" $1);"
- "printf(\"%%04x \", addr - base);"
- "print;"
- "}"
- "}' 1>&2";
-
- /* fake snprintf() to calculate the size */
- size = snprintf(NULL, 0, objdump_str, cross_compile, objname, funcs) + 1;
- if (size <= 0) {
- WARN("objdump string size calculation failed");
- return;
- }
-
- cmd = malloc(size);
-
- /* real snprintf() */
- snprintf(cmd, size, objdump_str, cross_compile, objname, funcs);
- ret = system(cmd);
- if (ret) {
- WARN("disassembly failed: %d", ret);
- return;
- }
-}
-
-static void disas_warned_funcs(struct objtool_file *file)
-{
- struct symbol *sym;
- char *funcs = NULL, *tmp;
-
- for_each_sym(file, sym) {
- if (sym->warned) {
- if (!funcs) {
- funcs = malloc(strlen(sym->name) + 1);
- if (!funcs) {
- ERROR_GLIBC("malloc");
- return;
- }
- strcpy(funcs, sym->name);
- } else {
- tmp = malloc(strlen(funcs) + strlen(sym->name) + 2);
- if (!tmp) {
- ERROR_GLIBC("malloc");
- return;
- }
- sprintf(tmp, "%s %s", funcs, sym->name);
- free(funcs);
- funcs = tmp;
- }
- }
- }
-
- if (funcs)
- disas_funcs(funcs);
-}
-
__weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc)
{
unsigned int type = reloc_type(reloc);
@@ -4693,7 +4857,7 @@ static int check_abs_references(struct objtool_file *file)
struct reloc *reloc;
int ret = 0;
- for_each_sec(file, sec) {
+ for_each_sec(file->elf, sec) {
/* absolute references in non-loadable sections are fine */
if (!(sec->sh.sh_flags & SHF_ALLOC))
continue;
@@ -4748,10 +4912,35 @@ static void free_insns(struct objtool_file *file)
free(chunk->addr);
}
+const char *objtool_disas_insn(struct instruction *insn)
+{
+ struct disas_context *dctx = objtool_disas_ctx;
+
+ if (!dctx)
+ return "";
+
+ disas_insn(dctx, insn);
+ return disas_result(dctx);
+}
+
int check(struct objtool_file *file)
{
+ struct disas_context *disas_ctx = NULL;
int ret = 0, warnings = 0;
+ /*
+ * Create a disassembly context if we might disassemble any
+ * instruction or function.
+ */
+ if (opts.verbose || opts.backtrace || opts.trace || opts.disas) {
+ disas_ctx = disas_context_create(file);
+ if (!disas_ctx) {
+ opts.disas = false;
+ opts.trace = false;
+ }
+ objtool_disas_ctx = disas_ctx;
+ }
+
arch_initial_func_cfi_state(&initial_func_cfi);
init_cfi_state(&init_cfi);
init_cfi_state(&func_cfi);
@@ -4767,6 +4956,10 @@ int check(struct objtool_file *file)
cfi_hash_add(&init_cfi);
cfi_hash_add(&func_cfi);
+ ret = checksum_debug_init(file);
+ if (ret)
+ goto out;
+
ret = decode_sections(file);
if (ret)
goto out;
@@ -4777,7 +4970,7 @@ int check(struct objtool_file *file)
if (opts.retpoline)
warnings += validate_retpoline(file);
- if (opts.stackval || opts.orc || opts.uaccess) {
+ if (validate_branch_enabled()) {
int w = 0;
w += validate_functions(file);
@@ -4842,7 +5035,7 @@ int check(struct objtool_file *file)
}
if (opts.prefix) {
- ret = add_prefix_symbols(file);
+ ret = create_prefix_symbols(file);
if (ret)
goto out;
}
@@ -4856,14 +5049,18 @@ int check(struct objtool_file *file)
if (opts.noabs)
warnings += check_abs_references(file);
+ if (opts.checksum) {
+ ret = create_sym_checksum_section(file);
+ if (ret)
+ goto out;
+ }
+
if (opts.orc && nr_insns) {
ret = orc_create(file);
if (ret)
goto out;
}
- free_insns(file);
-
if (opts.stats) {
printf("nr_insns_visited: %ld\n", nr_insns_visited);
printf("nr_cfi: %ld\n", nr_cfi);
@@ -4872,18 +5069,32 @@ int check(struct objtool_file *file)
}
out:
- if (!ret && !warnings)
- return 0;
+ if (ret || warnings) {
+ if (opts.werror && warnings)
+ ret = 1;
+
+ if (opts.verbose) {
+ if (opts.werror && warnings)
+ WARN("%d warning(s) upgraded to errors", warnings);
+ disas_warned_funcs(disas_ctx);
+ }
+ }
- if (opts.werror && warnings)
- ret = 1;
+ if (opts.disas)
+ disas_funcs(disas_ctx);
- if (opts.verbose) {
- if (opts.werror && warnings)
- WARN("%d warning(s) upgraded to errors", warnings);
- print_args();
- disas_warned_funcs(file);
+ if (disas_ctx) {
+ disas_context_destroy(disas_ctx);
+ objtool_disas_ctx = NULL;
}
+ free_insns(file);
+
+ if (!ret && !warnings)
+ return 0;
+
+ if (opts.backup && make_backup())
+ return 1;
+
return ret;
}
diff --git a/tools/objtool/disas.c b/tools/objtool/disas.c
new file mode 100644
index 000000000000..2b5059f55e40
--- /dev/null
+++ b/tools/objtool/disas.c
@@ -0,0 +1,1248 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ */
+
+#define _GNU_SOURCE
+#include <fnmatch.h>
+
+#include <objtool/arch.h>
+#include <objtool/check.h>
+#include <objtool/disas.h>
+#include <objtool/special.h>
+#include <objtool/warn.h>
+
+#include <bfd.h>
+#include <linux/string.h>
+#include <tools/dis-asm-compat.h>
+
+/*
+ * Size of the buffer for storing the result of disassembling
+ * a single instruction.
+ */
+#define DISAS_RESULT_SIZE 1024
+
+struct disas_context {
+ struct objtool_file *file;
+ struct instruction *insn;
+ bool alt_applied;
+ char result[DISAS_RESULT_SIZE];
+ disassembler_ftype disassembler;
+ struct disassemble_info info;
+};
+
+/*
+ * Maximum number of alternatives
+ */
+#define DISAS_ALT_MAX 5
+
+/*
+ * Maximum number of instructions per alternative
+ */
+#define DISAS_ALT_INSN_MAX 50
+
+/*
+ * Information to disassemble an alternative
+ */
+struct disas_alt {
+ struct instruction *orig_insn; /* original instruction */
+ struct alternative *alt; /* alternative or NULL if default code */
+ char *name; /* name for this alternative */
+ int width; /* formatting width */
+ struct {
+ char *str; /* instruction string */
+ int offset; /* instruction offset */
+ int nops; /* number of nops */
+ } insn[DISAS_ALT_INSN_MAX]; /* alternative instructions */
+ int insn_idx; /* index of the next instruction to print */
+};
+
+#define DALT_DEFAULT(dalt) (!(dalt)->alt)
+#define DALT_INSN(dalt) (DALT_DEFAULT(dalt) ? (dalt)->orig_insn : (dalt)->alt->insn)
+#define DALT_GROUP(dalt) (DALT_INSN(dalt)->alt_group)
+#define DALT_ALTID(dalt) ((dalt)->orig_insn->offset)
+
+#define ALT_FLAGS_SHIFT 16
+#define ALT_FLAG_NOT (1 << 0)
+#define ALT_FLAG_DIRECT_CALL (1 << 1)
+#define ALT_FEATURE_MASK ((1 << ALT_FLAGS_SHIFT) - 1)
+
+static int alt_feature(unsigned int ft_flags)
+{
+ return (ft_flags & ALT_FEATURE_MASK);
+}
+
+static int alt_flags(unsigned int ft_flags)
+{
+ return (ft_flags >> ALT_FLAGS_SHIFT);
+}
+
+/*
+ * Wrapper around asprintf() to allocate and format a string.
+ * Return the allocated string or NULL on error.
+ */
+static char *strfmt(const char *fmt, ...)
+{
+ va_list ap;
+ char *str;
+ int rv;
+
+ va_start(ap, fmt);
+ rv = vasprintf(&str, fmt, ap);
+ va_end(ap);
+
+ return rv == -1 ? NULL : str;
+}
+
+static int sprint_name(char *str, const char *name, unsigned long offset)
+{
+ int len;
+
+ if (offset)
+ len = sprintf(str, "%s+0x%lx", name, offset);
+ else
+ len = sprintf(str, "%s", name);
+
+ return len;
+}
+
+#define DINFO_FPRINTF(dinfo, ...) \
+ ((*(dinfo)->fprintf_func)((dinfo)->stream, __VA_ARGS__))
+
+static int disas_result_fprintf(struct disas_context *dctx,
+ const char *fmt, va_list ap)
+{
+ char *buf = dctx->result;
+ int avail, len;
+
+ len = strlen(buf);
+ if (len >= DISAS_RESULT_SIZE - 1) {
+ WARN_FUNC(dctx->insn->sec, dctx->insn->offset,
+ "disassembly buffer is full");
+ return -1;
+ }
+ avail = DISAS_RESULT_SIZE - len;
+
+ len = vsnprintf(buf + len, avail, fmt, ap);
+ if (len < 0 || len >= avail) {
+ WARN_FUNC(dctx->insn->sec, dctx->insn->offset,
+ "disassembly buffer is truncated");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int disas_fprintf(void *stream, const char *fmt, ...)
+{
+ va_list arg;
+ int rv;
+
+ va_start(arg, fmt);
+ rv = disas_result_fprintf(stream, fmt, arg);
+ va_end(arg);
+
+ return rv;
+}
+
+/*
+ * For init_disassemble_info_compat().
+ */
+static int disas_fprintf_styled(void *stream,
+ enum disassembler_style style,
+ const char *fmt, ...)
+{
+ va_list arg;
+ int rv;
+
+ va_start(arg, fmt);
+ rv = disas_result_fprintf(stream, fmt, arg);
+ va_end(arg);
+
+ return rv;
+}
+
+static void disas_print_addr_sym(struct section *sec, struct symbol *sym,
+ bfd_vma addr, struct disassemble_info *dinfo)
+{
+ char symstr[1024];
+ char *str;
+
+ if (sym) {
+ sprint_name(symstr, sym->name, addr - sym->offset);
+ DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr);
+ } else {
+ str = offstr(sec, addr);
+ DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str);
+ free(str);
+ }
+}
+
+static bool disas_print_addr_alt(bfd_vma addr, struct disassemble_info *dinfo)
+{
+ struct disas_context *dctx = dinfo->application_data;
+ struct instruction *orig_first_insn;
+ struct alt_group *alt_group;
+ unsigned long offset;
+ struct symbol *sym;
+
+ /*
+ * Check if we are processing an alternative at the original
+ * instruction address (i.e. if alt_applied is true) and if
+ * we are referencing an address inside the alternative.
+ *
+ * For example, this happens if there is a branch inside an
+ * alternative. In that case, the address should be updated
+ * to a reference inside the original instruction flow.
+ */
+ if (!dctx->alt_applied)
+ return false;
+
+ alt_group = dctx->insn->alt_group;
+ if (!alt_group || !alt_group->orig_group ||
+ addr < alt_group->first_insn->offset ||
+ addr > alt_group->last_insn->offset)
+ return false;
+
+ orig_first_insn = alt_group->orig_group->first_insn;
+ offset = addr - alt_group->first_insn->offset;
+
+ addr = orig_first_insn->offset + offset;
+ sym = orig_first_insn->sym;
+
+ disas_print_addr_sym(orig_first_insn->sec, sym, addr, dinfo);
+
+ return true;
+}
+
+static void disas_print_addr_noreloc(bfd_vma addr,
+ struct disassemble_info *dinfo)
+{
+ struct disas_context *dctx = dinfo->application_data;
+ struct instruction *insn = dctx->insn;
+ struct symbol *sym = NULL;
+
+ if (disas_print_addr_alt(addr, dinfo))
+ return;
+
+ if (insn->sym && addr >= insn->sym->offset &&
+ addr < insn->sym->offset + insn->sym->len) {
+ sym = insn->sym;
+ }
+
+ disas_print_addr_sym(insn->sec, sym, addr, dinfo);
+}
+
+static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo)
+{
+ struct disas_context *dctx = dinfo->application_data;
+ struct instruction *insn = dctx->insn;
+ unsigned long offset;
+ struct reloc *reloc;
+ char symstr[1024];
+ char *str;
+
+ reloc = find_reloc_by_dest_range(dctx->file->elf, insn->sec,
+ insn->offset, insn->len);
+ if (!reloc) {
+ /*
+ * There is no relocation for this instruction although
+ * the address to resolve points to the next instruction.
+ * So this is an effective reference to the next IP, for
+ * example: "lea 0x0(%rip),%rdi". The kernel can reference
+ * the next IP with _THIS_IP_ macro.
+ */
+ DINFO_FPRINTF(dinfo, "0x%lx <_THIS_IP_>", addr);
+ return;
+ }
+
+ offset = arch_insn_adjusted_addend(insn, reloc);
+
+ /*
+ * If the relocation symbol is a section name (for example ".bss")
+ * then we try to further resolve the name.
+ */
+ if (reloc->sym->type == STT_SECTION) {
+ str = offstr(reloc->sym->sec, reloc->sym->offset + offset);
+ DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str);
+ free(str);
+ } else {
+ sprint_name(symstr, reloc->sym->name, offset);
+ DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr);
+ }
+}
+
+/*
+ * Resolve an address into a "<symbol>+<offset>" string.
+ */
+static void disas_print_address(bfd_vma addr, struct disassemble_info *dinfo)
+{
+ struct disas_context *dctx = dinfo->application_data;
+ struct instruction *insn = dctx->insn;
+ struct instruction *jump_dest;
+ struct symbol *sym;
+ bool is_reloc;
+
+ /*
+ * If the instruction is a call/jump and it references a
+ * destination then this is likely the address we are looking
+ * up. So check it first.
+ */
+ jump_dest = insn->jump_dest;
+ if (jump_dest && jump_dest->sym && jump_dest->offset == addr) {
+ if (!disas_print_addr_alt(addr, dinfo))
+ disas_print_addr_sym(jump_dest->sec, jump_dest->sym,
+ addr, dinfo);
+ return;
+ }
+
+ /*
+ * If the address points to the next instruction then there is
+ * probably a relocation. It can be a false positive when the
+ * current instruction is referencing the address of the next
+ * instruction. This particular case will be handled in
+ * disas_print_addr_reloc().
+ */
+ is_reloc = (addr == insn->offset + insn->len);
+
+ /*
+ * The call destination offset can be the address we are looking
+ * up, or 0 if there is a relocation.
+ */
+ sym = insn_call_dest(insn);
+ if (sym && (sym->offset == addr || (sym->offset == 0 && is_reloc))) {
+ DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, sym->name);
+ return;
+ }
+
+ if (!is_reloc)
+ disas_print_addr_noreloc(addr, dinfo);
+ else
+ disas_print_addr_reloc(addr, dinfo);
+}
+
+/*
+ * Initialize disassemble info arch, mach (32 or 64-bit) and options.
+ */
+int disas_info_init(struct disassemble_info *dinfo,
+ int arch, int mach32, int mach64,
+ const char *options)
+{
+ struct disas_context *dctx = dinfo->application_data;
+ struct objtool_file *file = dctx->file;
+
+ dinfo->arch = arch;
+
+ switch (file->elf->ehdr.e_ident[EI_CLASS]) {
+ case ELFCLASS32:
+ dinfo->mach = mach32;
+ break;
+ case ELFCLASS64:
+ dinfo->mach = mach64;
+ break;
+ default:
+ return -1;
+ }
+
+ dinfo->disassembler_options = options;
+
+ return 0;
+}
+
+struct disas_context *disas_context_create(struct objtool_file *file)
+{
+ struct disas_context *dctx;
+ struct disassemble_info *dinfo;
+ int err;
+
+ dctx = malloc(sizeof(*dctx));
+ if (!dctx) {
+ WARN("failed to allocate disassembly context");
+ return NULL;
+ }
+
+ dctx->file = file;
+ dinfo = &dctx->info;
+
+ init_disassemble_info_compat(dinfo, dctx,
+ disas_fprintf, disas_fprintf_styled);
+
+ dinfo->read_memory_func = buffer_read_memory;
+ dinfo->print_address_func = disas_print_address;
+ dinfo->application_data = dctx;
+
+ /*
+ * bfd_openr() is not used to avoid doing ELF data processing
+ * and caching that has already being done. Here, we just need
+ * to identify the target file so we call an arch specific
+ * function to fill some disassemble info (arch, mach).
+ */
+
+ dinfo->arch = bfd_arch_unknown;
+ dinfo->mach = 0;
+
+ err = arch_disas_info_init(dinfo);
+ if (err || dinfo->arch == bfd_arch_unknown || dinfo->mach == 0) {
+ WARN("failed to init disassembly arch");
+ goto error;
+ }
+
+ dinfo->endian = (file->elf->ehdr.e_ident[EI_DATA] == ELFDATA2MSB) ?
+ BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE;
+
+ disassemble_init_for_target(dinfo);
+
+ dctx->disassembler = disassembler(dinfo->arch,
+ dinfo->endian == BFD_ENDIAN_BIG,
+ dinfo->mach, NULL);
+ if (!dctx->disassembler) {
+ WARN("failed to create disassembler function");
+ goto error;
+ }
+
+ return dctx;
+
+error:
+ free(dctx);
+ return NULL;
+}
+
+void disas_context_destroy(struct disas_context *dctx)
+{
+ free(dctx);
+}
+
+char *disas_result(struct disas_context *dctx)
+{
+ return dctx->result;
+}
+
+#define DISAS_INSN_OFFSET_SPACE 10
+#define DISAS_INSN_SPACE 60
+
+#define DISAS_PRINSN(dctx, insn, depth) \
+ disas_print_insn(stdout, dctx, insn, depth, "\n")
+
+/*
+ * Print a message in the instruction flow. If sec is not NULL then the
+ * address at the section offset is printed in addition of the message,
+ * otherwise only the message is printed.
+ */
+static int disas_vprint(FILE *stream, struct section *sec, unsigned long offset,
+ int depth, const char *format, va_list ap)
+{
+ const char *addr_str;
+ int i, n;
+ int len;
+
+ len = sym_name_max_len + DISAS_INSN_OFFSET_SPACE;
+ if (depth < 0) {
+ len += depth;
+ depth = 0;
+ }
+
+ n = 0;
+
+ if (sec) {
+ addr_str = offstr(sec, offset);
+ n += fprintf(stream, "%6lx: %-*s ", offset, len, addr_str);
+ free((char *)addr_str);
+ } else {
+ len += DISAS_INSN_OFFSET_SPACE + 1;
+ n += fprintf(stream, "%-*s", len, "");
+ }
+
+ /* print vertical bars to show the code flow */
+ for (i = 0; i < depth; i++)
+ n += fprintf(stream, "| ");
+
+ if (format)
+ n += vfprintf(stream, format, ap);
+
+ return n;
+}
+
+static int disas_print(FILE *stream, struct section *sec, unsigned long offset,
+ int depth, const char *format, ...)
+{
+ va_list args;
+ int len;
+
+ va_start(args, format);
+ len = disas_vprint(stream, sec, offset, depth, format, args);
+ va_end(args);
+
+ return len;
+}
+
+/*
+ * Print a message in the instruction flow. If insn is not NULL then
+ * the instruction address is printed in addition of the message,
+ * otherwise only the message is printed. In all cases, the instruction
+ * itself is not printed.
+ */
+void disas_print_info(FILE *stream, struct instruction *insn, int depth,
+ const char *format, ...)
+{
+ struct section *sec;
+ unsigned long off;
+ va_list args;
+
+ if (insn) {
+ sec = insn->sec;
+ off = insn->offset;
+ } else {
+ sec = NULL;
+ off = 0;
+ }
+
+ va_start(args, format);
+ disas_vprint(stream, sec, off, depth, format, args);
+ va_end(args);
+}
+
+/*
+ * Print an instruction address (offset and function), the instruction itself
+ * and an optional message.
+ */
+void disas_print_insn(FILE *stream, struct disas_context *dctx,
+ struct instruction *insn, int depth,
+ const char *format, ...)
+{
+ char fake_nop_insn[32];
+ const char *insn_str;
+ bool fake_nop;
+ va_list args;
+ int len;
+
+ /*
+ * Alternative can insert a fake nop, sometimes with no
+ * associated section so nothing to disassemble.
+ */
+ fake_nop = (!insn->sec && insn->type == INSN_NOP);
+ if (fake_nop) {
+ snprintf(fake_nop_insn, 32, "<fake nop> (%d bytes)", insn->len);
+ insn_str = fake_nop_insn;
+ } else {
+ disas_insn(dctx, insn);
+ insn_str = disas_result(dctx);
+ }
+
+ /* print the instruction */
+ len = (depth + 1) * 2 < DISAS_INSN_SPACE ? DISAS_INSN_SPACE - (depth+1) * 2 : 1;
+ disas_print_info(stream, insn, depth, "%-*s", len, insn_str);
+
+ /* print message if any */
+ if (!format)
+ return;
+
+ if (strcmp(format, "\n") == 0) {
+ fprintf(stream, "\n");
+ return;
+ }
+
+ fprintf(stream, " - ");
+ va_start(args, format);
+ vfprintf(stream, format, args);
+ va_end(args);
+}
+
+/*
+ * Disassemble a single instruction. Return the size of the instruction.
+ *
+ * If alt_applied is true then insn should be an instruction from of an
+ * alternative (i.e. insn->alt_group != NULL), and it is disassembled
+ * at the location of the original code it is replacing. When the
+ * instruction references any address inside the alternative then
+ * these references will be re-adjusted to replace the original code.
+ */
+static size_t disas_insn_common(struct disas_context *dctx,
+ struct instruction *insn,
+ bool alt_applied)
+{
+ disassembler_ftype disasm = dctx->disassembler;
+ struct disassemble_info *dinfo = &dctx->info;
+
+ dctx->insn = insn;
+ dctx->alt_applied = alt_applied;
+ dctx->result[0] = '\0';
+
+ if (insn->type == INSN_NOP) {
+ DINFO_FPRINTF(dinfo, "nop%d", insn->len);
+ return insn->len;
+ }
+
+ /*
+ * Set the disassembler buffer to read data from the section
+ * containing the instruction to disassemble.
+ */
+ dinfo->buffer = insn->sec->data->d_buf;
+ dinfo->buffer_vma = 0;
+ dinfo->buffer_length = insn->sec->sh.sh_size;
+
+ return disasm(insn->offset, &dctx->info);
+}
+
+size_t disas_insn(struct disas_context *dctx, struct instruction *insn)
+{
+ return disas_insn_common(dctx, insn, false);
+}
+
+static size_t disas_insn_alt(struct disas_context *dctx,
+ struct instruction *insn)
+{
+ return disas_insn_common(dctx, insn, true);
+}
+
+static struct instruction *next_insn_same_alt(struct objtool_file *file,
+ struct alt_group *alt_grp,
+ struct instruction *insn)
+{
+ if (alt_grp->last_insn == insn || alt_grp->nop == insn)
+ return NULL;
+
+ return next_insn_same_sec(file, insn);
+}
+
+#define alt_for_each_insn(file, alt_grp, insn) \
+ for (insn = alt_grp->first_insn; \
+ insn; \
+ insn = next_insn_same_alt(file, alt_grp, insn))
+
+/*
+ * Provide a name for the type of alternatives present at the
+ * specified instruction.
+ *
+ * An instruction can have alternatives with different types, for
+ * example alternative instructions and an exception table. In that
+ * case the name for the alternative instructions type is used.
+ *
+ * Return NULL if the instruction as no alternative.
+ */
+const char *disas_alt_type_name(struct instruction *insn)
+{
+ struct alternative *alt;
+ const char *name;
+
+ name = NULL;
+ for (alt = insn->alts; alt; alt = alt->next) {
+ if (alt->type == ALT_TYPE_INSTRUCTIONS) {
+ name = "alternative";
+ break;
+ }
+
+ switch (alt->type) {
+ case ALT_TYPE_EX_TABLE:
+ name = "ex_table";
+ break;
+ case ALT_TYPE_JUMP_TABLE:
+ name = "jump_table";
+ break;
+ default:
+ name = "unknown";
+ break;
+ }
+ }
+
+ return name;
+}
+
+/*
+ * Provide a name for an alternative.
+ */
+char *disas_alt_name(struct alternative *alt)
+{
+ char pfx[4] = { 0 };
+ char *str = NULL;
+ const char *name;
+ int feature;
+ int flags;
+ int num;
+
+ switch (alt->type) {
+
+ case ALT_TYPE_EX_TABLE:
+ str = strdup("EXCEPTION");
+ break;
+
+ case ALT_TYPE_JUMP_TABLE:
+ str = strdup("JUMP");
+ break;
+
+ case ALT_TYPE_INSTRUCTIONS:
+ /*
+ * This is a non-default group alternative. Create a name
+ * based on the feature and flags associated with this
+ * alternative. Use either the feature name (it is available)
+ * or the feature number. And add a prefix to show the flags
+ * used.
+ *
+ * Prefix flags characters:
+ *
+ * '!' alternative used when feature not enabled
+ * '+' direct call alternative
+ * '?' unknown flag
+ */
+
+ if (!alt->insn->alt_group)
+ return NULL;
+
+ feature = alt->insn->alt_group->feature;
+ num = alt_feature(feature);
+ flags = alt_flags(feature);
+ str = pfx;
+
+ if (flags & ~(ALT_FLAG_NOT | ALT_FLAG_DIRECT_CALL))
+ *str++ = '?';
+ if (flags & ALT_FLAG_DIRECT_CALL)
+ *str++ = '+';
+ if (flags & ALT_FLAG_NOT)
+ *str++ = '!';
+
+ name = arch_cpu_feature_name(num);
+ if (!name)
+ str = strfmt("%sFEATURE 0x%X", pfx, num);
+ else
+ str = strfmt("%s%s", pfx, name);
+
+ break;
+ }
+
+ return str;
+}
+
+/*
+ * Initialize an alternative. The default alternative should be initialized
+ * with alt=NULL.
+ */
+static int disas_alt_init(struct disas_alt *dalt,
+ struct instruction *orig_insn,
+ struct alternative *alt)
+{
+ dalt->orig_insn = orig_insn;
+ dalt->alt = alt;
+ dalt->insn_idx = 0;
+ dalt->name = alt ? disas_alt_name(alt) : strdup("DEFAULT");
+ if (!dalt->name)
+ return -1;
+ dalt->width = strlen(dalt->name);
+
+ return 0;
+}
+
+static int disas_alt_add_insn(struct disas_alt *dalt, int index, char *insn_str,
+ int offset, int nops)
+{
+ int len;
+
+ if (index >= DISAS_ALT_INSN_MAX) {
+ WARN("Alternative %lx.%s has more instructions than supported",
+ DALT_ALTID(dalt), dalt->name);
+ return -1;
+ }
+
+ len = strlen(insn_str);
+ dalt->insn[index].str = insn_str;
+ dalt->insn[index].offset = offset;
+ dalt->insn[index].nops = nops;
+ if (len > dalt->width)
+ dalt->width = len;
+
+ return 0;
+}
+
+static int disas_alt_jump(struct disas_alt *dalt)
+{
+ struct instruction *orig_insn;
+ struct instruction *dest_insn;
+ char suffix[2] = { 0 };
+ char *str;
+ int nops;
+
+ orig_insn = dalt->orig_insn;
+ dest_insn = dalt->alt->insn;
+
+ if (orig_insn->type == INSN_NOP) {
+ if (orig_insn->len == 5)
+ suffix[0] = 'q';
+ str = strfmt("jmp%-3s %lx <%s+0x%lx>", suffix,
+ dest_insn->offset, dest_insn->sym->name,
+ dest_insn->offset - dest_insn->sym->offset);
+ nops = 0;
+ } else {
+ str = strfmt("nop%d", orig_insn->len);
+ nops = orig_insn->len;
+ }
+
+ if (!str)
+ return -1;
+
+ disas_alt_add_insn(dalt, 0, str, 0, nops);
+
+ return 1;
+}
+
+/*
+ * Disassemble an exception table alternative.
+ */
+static int disas_alt_extable(struct disas_alt *dalt)
+{
+ struct instruction *alt_insn;
+ char *str;
+
+ alt_insn = dalt->alt->insn;
+ str = strfmt("resume at 0x%lx <%s+0x%lx>",
+ alt_insn->offset, alt_insn->sym->name,
+ alt_insn->offset - alt_insn->sym->offset);
+ if (!str)
+ return -1;
+
+ disas_alt_add_insn(dalt, 0, str, 0, 0);
+
+ return 1;
+}
+
+/*
+ * Disassemble an alternative and store instructions in the disas_alt
+ * structure. Return the number of instructions in the alternative.
+ */
+static int disas_alt_group(struct disas_context *dctx, struct disas_alt *dalt)
+{
+ struct objtool_file *file;
+ struct instruction *insn;
+ int offset;
+ char *str;
+ int count;
+ int nops;
+ int err;
+
+ file = dctx->file;
+ count = 0;
+ offset = 0;
+ nops = 0;
+
+ alt_for_each_insn(file, DALT_GROUP(dalt), insn) {
+
+ disas_insn_alt(dctx, insn);
+ str = strdup(disas_result(dctx));
+ if (!str)
+ return -1;
+
+ nops = insn->type == INSN_NOP ? insn->len : 0;
+ err = disas_alt_add_insn(dalt, count, str, offset, nops);
+ if (err)
+ break;
+ offset += insn->len;
+ count++;
+ }
+
+ return count;
+}
+
+/*
+ * Disassemble the default alternative.
+ */
+static int disas_alt_default(struct disas_context *dctx, struct disas_alt *dalt)
+{
+ char *str;
+ int nops;
+ int err;
+
+ if (DALT_GROUP(dalt))
+ return disas_alt_group(dctx, dalt);
+
+ /*
+ * Default alternative with no alt_group: this is the default
+ * code associated with either a jump table or an exception
+ * table and no other instruction alternatives. In that case
+ * the default alternative is made of a single instruction.
+ */
+ disas_insn(dctx, dalt->orig_insn);
+ str = strdup(disas_result(dctx));
+ if (!str)
+ return -1;
+ nops = dalt->orig_insn->type == INSN_NOP ? dalt->orig_insn->len : 0;
+ err = disas_alt_add_insn(dalt, 0, str, 0, nops);
+ if (err)
+ return -1;
+
+ return 1;
+}
+
+/*
+ * For each alternative, if there is an instruction at the specified
+ * offset then print this instruction, otherwise print a blank entry.
+ * The offset is an offset from the start of the alternative.
+ *
+ * Return the offset for the next instructions to print, or -1 if all
+ * instructions have been printed.
+ */
+static int disas_alt_print_insn(struct disas_alt *dalts, int alt_count,
+ int insn_count, int offset)
+{
+ struct disas_alt *dalt;
+ int offset_next;
+ char *str;
+ int i, j;
+
+ offset_next = -1;
+
+ for (i = 0; i < alt_count; i++) {
+ dalt = &dalts[i];
+ j = dalt->insn_idx;
+ if (j == -1) {
+ printf("| %-*s ", dalt->width, "");
+ continue;
+ }
+
+ if (dalt->insn[j].offset == offset) {
+ str = dalt->insn[j].str;
+ printf("| %-*s ", dalt->width, str ?: "");
+ if (++j < insn_count) {
+ dalt->insn_idx = j;
+ } else {
+ dalt->insn_idx = -1;
+ continue;
+ }
+ } else {
+ printf("| %-*s ", dalt->width, "");
+ }
+
+ if (dalt->insn[j].offset > 0 &&
+ (offset_next == -1 ||
+ (dalt->insn[j].offset < offset_next)))
+ offset_next = dalt->insn[j].offset;
+ }
+ printf("\n");
+
+ return offset_next;
+}
+
+/*
+ * Print all alternatives side-by-side.
+ */
+static void disas_alt_print_wide(char *alt_name, struct disas_alt *dalts, int alt_count,
+ int insn_count)
+{
+ struct instruction *orig_insn;
+ int offset_next;
+ int offset;
+ int i;
+
+ orig_insn = dalts[0].orig_insn;
+
+ /*
+ * Print an header with the name of each alternative.
+ */
+ disas_print_info(stdout, orig_insn, -2, NULL);
+
+ if (strlen(alt_name) > dalts[0].width)
+ dalts[0].width = strlen(alt_name);
+ printf("| %-*s ", dalts[0].width, alt_name);
+
+ for (i = 1; i < alt_count; i++)
+ printf("| %-*s ", dalts[i].width, dalts[i].name);
+
+ printf("\n");
+
+ /*
+ * Print instructions for each alternative.
+ */
+ offset_next = 0;
+ do {
+ offset = offset_next;
+ disas_print(stdout, orig_insn->sec, orig_insn->offset + offset,
+ -2, NULL);
+ offset_next = disas_alt_print_insn(dalts, alt_count, insn_count,
+ offset);
+ } while (offset_next > offset);
+}
+
+/*
+ * Print all alternatives one above the other.
+ */
+static void disas_alt_print_compact(char *alt_name, struct disas_alt *dalts,
+ int alt_count, int insn_count)
+{
+ struct instruction *orig_insn;
+ int width;
+ int i, j;
+ int len;
+
+ orig_insn = dalts[0].orig_insn;
+
+ len = disas_print(stdout, orig_insn->sec, orig_insn->offset, 0, NULL);
+ printf("%s\n", alt_name);
+
+ /*
+ * If all alternatives have a single instruction then print each
+ * alternative on a single line. Otherwise, print alternatives
+ * one above the other with a clear separation.
+ */
+
+ if (insn_count == 1) {
+ width = 0;
+ for (i = 0; i < alt_count; i++) {
+ if (dalts[i].width > width)
+ width = dalts[i].width;
+ }
+
+ for (i = 0; i < alt_count; i++) {
+ printf("%*s= %-*s (if %s)\n", len, "", width,
+ dalts[i].insn[0].str, dalts[i].name);
+ }
+
+ return;
+ }
+
+ for (i = 0; i < alt_count; i++) {
+ printf("%*s= %s\n", len, "", dalts[i].name);
+ for (j = 0; j < insn_count; j++) {
+ if (!dalts[i].insn[j].str)
+ break;
+ disas_print(stdout, orig_insn->sec,
+ orig_insn->offset + dalts[i].insn[j].offset, 0,
+ "| %s\n", dalts[i].insn[j].str);
+ }
+ printf("%*s|\n", len, "");
+ }
+}
+
+/*
+ * Trim NOPs in alternatives. This replaces trailing NOPs in alternatives
+ * with a single indication of the number of bytes covered with NOPs.
+ *
+ * Return the maximum numbers of instructions in all alternatives after
+ * trailing NOPs have been trimmed.
+ */
+static int disas_alt_trim_nops(struct disas_alt *dalts, int alt_count,
+ int insn_count)
+{
+ struct disas_alt *dalt;
+ int nops_count;
+ const char *s;
+ int offset;
+ int count;
+ int nops;
+ int i, j;
+
+ count = 0;
+ for (i = 0; i < alt_count; i++) {
+ offset = 0;
+ nops = 0;
+ nops_count = 0;
+ dalt = &dalts[i];
+ for (j = insn_count - 1; j >= 0; j--) {
+ if (!dalt->insn[j].str || !dalt->insn[j].nops)
+ break;
+ offset = dalt->insn[j].offset;
+ free(dalt->insn[j].str);
+ dalt->insn[j].offset = 0;
+ dalt->insn[j].str = NULL;
+ nops += dalt->insn[j].nops;
+ nops_count++;
+ }
+
+ /*
+ * All trailing NOPs have been removed. If there was a single
+ * NOP instruction then re-add it. If there was a block of
+ * NOPs then indicate the number of bytes than the block
+ * covers (nop*<number-of-bytes>).
+ */
+ if (nops_count) {
+ s = nops_count == 1 ? "" : "*";
+ dalt->insn[j + 1].str = strfmt("nop%s%d", s, nops);
+ dalt->insn[j + 1].offset = offset;
+ dalt->insn[j + 1].nops = nops;
+ j++;
+ }
+
+ if (j > count)
+ count = j;
+ }
+
+ return count + 1;
+}
+
+/*
+ * Disassemble an alternative.
+ *
+ * Return the last instruction in the default alternative so that
+ * disassembly can continue with the next instruction. Return NULL
+ * on error.
+ */
+static void *disas_alt(struct disas_context *dctx,
+ struct instruction *orig_insn)
+{
+ struct disas_alt dalts[DISAS_ALT_MAX] = { 0 };
+ struct instruction *last_insn = NULL;
+ struct alternative *alt;
+ struct disas_alt *dalt;
+ int insn_count = 0;
+ int alt_count = 0;
+ char *alt_name;
+ int count;
+ int i, j;
+ int err;
+
+ alt_name = strfmt("<%s.%lx>", disas_alt_type_name(orig_insn),
+ orig_insn->offset);
+ if (!alt_name) {
+ WARN("Failed to define name for alternative at instruction 0x%lx",
+ orig_insn->offset);
+ goto done;
+ }
+
+ /*
+ * Initialize and disassemble the default alternative.
+ */
+ err = disas_alt_init(&dalts[0], orig_insn, NULL);
+ if (err) {
+ WARN("%s: failed to initialize default alternative", alt_name);
+ goto done;
+ }
+
+ insn_count = disas_alt_default(dctx, &dalts[0]);
+ if (insn_count < 0) {
+ WARN("%s: failed to disassemble default alternative", alt_name);
+ goto done;
+ }
+
+ /*
+ * Initialize and disassemble all other alternatives.
+ */
+ i = 1;
+ for (alt = orig_insn->alts; alt; alt = alt->next) {
+ if (i >= DISAS_ALT_MAX) {
+ WARN("%s has more alternatives than supported", alt_name);
+ break;
+ }
+
+ dalt = &dalts[i];
+ err = disas_alt_init(dalt, orig_insn, alt);
+ if (err) {
+ WARN("%s: failed to disassemble alternative", alt_name);
+ goto done;
+ }
+
+ count = -1;
+ switch (dalt->alt->type) {
+ case ALT_TYPE_INSTRUCTIONS:
+ count = disas_alt_group(dctx, dalt);
+ break;
+ case ALT_TYPE_EX_TABLE:
+ count = disas_alt_extable(dalt);
+ break;
+ case ALT_TYPE_JUMP_TABLE:
+ count = disas_alt_jump(dalt);
+ break;
+ }
+ if (count < 0) {
+ WARN("%s: failed to disassemble alternative %s",
+ alt_name, dalt->name);
+ goto done;
+ }
+
+ insn_count = count > insn_count ? count : insn_count;
+ i++;
+ }
+ alt_count = i;
+
+ /*
+ * Print default and non-default alternatives.
+ */
+
+ insn_count = disas_alt_trim_nops(dalts, alt_count, insn_count);
+
+ if (opts.wide)
+ disas_alt_print_wide(alt_name, dalts, alt_count, insn_count);
+ else
+ disas_alt_print_compact(alt_name, dalts, alt_count, insn_count);
+
+ last_insn = orig_insn->alt_group ? orig_insn->alt_group->last_insn :
+ orig_insn;
+
+done:
+ for (i = 0; i < alt_count; i++) {
+ free(dalts[i].name);
+ for (j = 0; j < insn_count; j++)
+ free(dalts[i].insn[j].str);
+ }
+
+ free(alt_name);
+
+ return last_insn;
+}
+
+/*
+ * Disassemble a function.
+ */
+static void disas_func(struct disas_context *dctx, struct symbol *func)
+{
+ struct instruction *insn_start;
+ struct instruction *insn;
+
+ printf("%s:\n", func->name);
+ sym_for_each_insn(dctx->file, func, insn) {
+ if (insn->alts) {
+ insn_start = insn;
+ insn = disas_alt(dctx, insn);
+ if (insn)
+ continue;
+ /*
+ * There was an error with disassembling
+ * the alternative. Resume disassembling
+ * at the current instruction, this will
+ * disassemble the default alternative
+ * only and continue with the code after
+ * the alternative.
+ */
+ insn = insn_start;
+ }
+
+ DISAS_PRINSN(dctx, insn, 0);
+ }
+ printf("\n");
+}
+
+/*
+ * Disassemble all warned functions.
+ */
+void disas_warned_funcs(struct disas_context *dctx)
+{
+ struct symbol *sym;
+
+ if (!dctx)
+ return;
+
+ for_each_sym(dctx->file->elf, sym) {
+ if (sym->warned)
+ disas_func(dctx, sym);
+ }
+}
+
+void disas_funcs(struct disas_context *dctx)
+{
+ bool disas_all = !strcmp(opts.disas, "*");
+ struct section *sec;
+ struct symbol *sym;
+
+ for_each_sec(dctx->file->elf, sec) {
+
+ if (!(sec->sh.sh_flags & SHF_EXECINSTR))
+ continue;
+
+ sec_for_each_sym(sec, sym) {
+ /*
+ * If the function had a warning and the verbose
+ * option is used then the function was already
+ * disassemble.
+ */
+ if (opts.verbose && sym->warned)
+ continue;
+
+ if (disas_all || fnmatch(opts.disas, sym->name, 0) == 0)
+ disas_func(dctx, sym);
+ }
+ }
+}
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index ca5d77db692a..6a8ed9c62323 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -16,12 +16,17 @@
#include <string.h>
#include <unistd.h>
#include <errno.h>
+#include <libgen.h>
+#include <ctype.h>
#include <linux/interval_tree_generic.h>
#include <objtool/builtin.h>
-
#include <objtool/elf.h>
#include <objtool/warn.h>
+#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_UP_POW2(x) (1U << ((8 * sizeof(x)) - __builtin_clz((x) - 1U)))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
static inline u32 str_hash(const char *str)
{
return jhash(str, strlen(str), 0);
@@ -92,11 +97,12 @@ static inline unsigned long __sym_start(struct symbol *s)
static inline unsigned long __sym_last(struct symbol *s)
{
- return s->offset + s->len - 1;
+ return s->offset + (s->len ? s->len - 1 : 0);
}
INTERVAL_TREE_DEFINE(struct symbol, node, unsigned long, __subtree_last,
- __sym_start, __sym_last, static, __sym)
+ __sym_start, __sym_last, static inline __maybe_unused,
+ __sym)
#define __sym_for_each(_iter, _tree, _start, _end) \
for (_iter = __sym_iter_first((_tree), (_start), (_end)); \
@@ -108,7 +114,7 @@ struct symbol_hole {
};
/*
- * Find !section symbol where @offset is after it.
+ * Find the last symbol before @offset.
*/
static int symbol_hole_by_offset(const void *key, const struct rb_node *node)
{
@@ -119,8 +125,7 @@ static int symbol_hole_by_offset(const void *key, const struct rb_node *node)
return -1;
if (sh->key >= s->offset + s->len) {
- if (s->type != STT_SECTION)
- sh->sym = s;
+ sh->sym = s;
return 1;
}
@@ -167,11 +172,11 @@ static struct symbol *find_symbol_by_index(struct elf *elf, unsigned int idx)
struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset)
{
struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree;
- struct symbol *iter;
+ struct symbol *sym;
- __sym_for_each(iter, tree, offset, offset) {
- if (iter->offset == offset && iter->type != STT_SECTION)
- return iter;
+ __sym_for_each(sym, tree, offset, offset) {
+ if (sym->offset == offset && !is_sec_sym(sym))
+ return sym->alias;
}
return NULL;
@@ -180,11 +185,11 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset)
struct symbol *find_func_by_offset(struct section *sec, unsigned long offset)
{
struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree;
- struct symbol *iter;
+ struct symbol *func;
- __sym_for_each(iter, tree, offset, offset) {
- if (iter->offset == offset && iter->type == STT_FUNC)
- return iter;
+ __sym_for_each(func, tree, offset, offset) {
+ if (func->offset == offset && is_func_sym(func))
+ return func->alias;
}
return NULL;
@@ -193,14 +198,29 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset)
struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset)
{
struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree;
- struct symbol *iter;
+ struct symbol *sym = NULL, *tmp;
- __sym_for_each(iter, tree, offset, offset) {
- if (iter->type != STT_SECTION)
- return iter;
+ __sym_for_each(tmp, tree, offset, offset) {
+ if (tmp->len) {
+ if (!sym) {
+ sym = tmp;
+ continue;
+ }
+
+ if (sym->offset != tmp->offset || sym->len != tmp->len) {
+ /*
+ * In the rare case of overlapping symbols,
+ * pick the smaller one.
+ *
+ * TODO: outlaw overlapping symbols
+ */
+ if (tmp->len < sym->len)
+ sym = tmp;
+ }
+ }
}
- return NULL;
+ return sym ? sym->alias : NULL;
}
/*
@@ -246,11 +266,11 @@ int find_symbol_hole_containing(const struct section *sec, unsigned long offset)
struct symbol *find_func_containing(struct section *sec, unsigned long offset)
{
struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree;
- struct symbol *iter;
+ struct symbol *func;
- __sym_for_each(iter, tree, offset, offset) {
- if (iter->type == STT_FUNC)
- return iter;
+ __sym_for_each(func, tree, offset, offset) {
+ if (is_func_sym(func))
+ return func->alias;
}
return NULL;
@@ -268,6 +288,35 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name)
return NULL;
}
+/* Find local symbol with matching STT_FILE */
+static struct symbol *find_local_symbol_by_file_and_name(const struct elf *elf,
+ struct symbol *file,
+ const char *name)
+{
+ struct symbol *sym;
+
+ elf_hash_for_each_possible(symbol_name, sym, name_hash, str_hash(name)) {
+ if (sym->bind == STB_LOCAL && sym->file == file &&
+ !strcmp(sym->name, name)) {
+ return sym;
+ }
+ }
+
+ return NULL;
+}
+
+struct symbol *find_global_symbol_by_name(const struct elf *elf, const char *name)
+{
+ struct symbol *sym;
+
+ elf_hash_for_each_possible(symbol_name, sym, name_hash, str_hash(name)) {
+ if (!strcmp(sym->name, name) && !is_local_sym(sym))
+ return sym;
+ }
+
+ return NULL;
+}
+
struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec,
unsigned long offset, unsigned int len)
{
@@ -358,14 +407,14 @@ static int read_sections(struct elf *elf)
return -1;
}
- if (sec->sh.sh_size != 0 && !is_dwarf_section(sec)) {
+ if (sec_size(sec) != 0 && !is_dwarf_section(sec)) {
sec->data = elf_getdata(s, NULL);
if (!sec->data) {
ERROR_ELF("elf_getdata");
return -1;
}
if (sec->data->d_off != 0 ||
- sec->data->d_size != sec->sh.sh_size) {
+ sec->data->d_size != sec_size(sec)) {
ERROR("unexpected data attributes for %s", sec->name);
return -1;
}
@@ -393,7 +442,38 @@ static int read_sections(struct elf *elf)
return 0;
}
-static void elf_add_symbol(struct elf *elf, struct symbol *sym)
+static const char *demangle_name(struct symbol *sym)
+{
+ char *str;
+
+ if (!is_local_sym(sym))
+ return sym->name;
+
+ if (!is_func_sym(sym) && !is_object_sym(sym))
+ return sym->name;
+
+ if (!strstarts(sym->name, "__UNIQUE_ID_") && !strchr(sym->name, '.'))
+ return sym->name;
+
+ str = strdup(sym->name);
+ if (!str) {
+ ERROR_GLIBC("strdup");
+ return NULL;
+ }
+
+ for (int i = strlen(str) - 1; i >= 0; i--) {
+ char c = str[i];
+
+ if (!isdigit(c) && c != '.') {
+ str[i + 1] = '\0';
+ break;
+ }
+ }
+
+ return str;
+}
+
+static int elf_add_symbol(struct elf *elf, struct symbol *sym)
{
struct list_head *entry;
struct rb_node *pnode;
@@ -405,14 +485,15 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym)
sym->type = GELF_ST_TYPE(sym->sym.st_info);
sym->bind = GELF_ST_BIND(sym->sym.st_info);
- if (sym->type == STT_FILE)
+ if (is_file_sym(sym))
elf->num_files++;
sym->offset = sym->sym.st_value;
sym->len = sym->sym.st_size;
__sym_for_each(iter, &sym->sec->symbol_tree, sym->offset, sym->offset) {
- if (iter->offset == sym->offset && iter->type == sym->type)
+ if (!is_undef_sym(iter) && iter->offset == sym->offset &&
+ iter->type == sym->type && iter->len == sym->len)
iter->alias = sym;
}
@@ -423,21 +504,44 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym)
else
entry = &sym->sec->symbol_list;
list_add(&sym->list, entry);
+
+ list_add_tail(&sym->global_list, &elf->symbols);
elf_hash_add(symbol, &sym->hash, sym->idx);
elf_hash_add(symbol_name, &sym->name_hash, str_hash(sym->name));
- /*
- * Don't store empty STT_NOTYPE symbols in the rbtree. They
- * can exist within a function, confusing the sorting.
- */
- if (!sym->len)
- __sym_remove(sym, &sym->sec->symbol_tree);
+ if (is_func_sym(sym) &&
+ (strstarts(sym->name, "__pfx_") ||
+ strstarts(sym->name, "__cfi_") ||
+ strstarts(sym->name, "__pi___pfx_") ||
+ strstarts(sym->name, "__pi___cfi_")))
+ sym->prefix = 1;
+
+ if (strstarts(sym->name, ".klp.sym"))
+ sym->klp = 1;
+
+ if (!sym->klp && !is_sec_sym(sym) && strstr(sym->name, ".cold")) {
+ sym->cold = 1;
+
+ /*
+ * Clang doesn't mark cold subfunctions as STT_FUNC, which
+ * breaks several objtool assumptions. Fake it.
+ */
+ sym->type = STT_FUNC;
+ }
+
+ sym->pfunc = sym->cfunc = sym;
+
+ sym->demangled_name = demangle_name(sym);
+ if (!sym->demangled_name)
+ return -1;
+
+ return 0;
}
static int read_symbols(struct elf *elf)
{
struct section *symtab, *symtab_shndx, *sec;
- struct symbol *sym, *pfunc;
+ struct symbol *sym, *pfunc, *file = NULL;
int symbols_nr, i;
char *coldstr;
Elf_Data *shndx_data = NULL;
@@ -469,6 +573,9 @@ static int read_symbols(struct elf *elf)
ERROR_GLIBC("calloc");
return -1;
}
+
+ INIT_LIST_HEAD(&elf->symbols);
+
for (i = 0; i < symbols_nr; i++) {
sym = &elf->symbol_data[i];
@@ -477,14 +584,14 @@ static int read_symbols(struct elf *elf)
if (!gelf_getsymshndx(symtab->data, shndx_data, i, &sym->sym,
&shndx)) {
ERROR_ELF("gelf_getsymshndx");
- goto err;
+ return -1;
}
sym->name = elf_strptr(elf->elf, symtab->sh.sh_link,
sym->sym.st_name);
if (!sym->name) {
ERROR_ELF("elf_strptr");
- goto err;
+ return -1;
}
if ((sym->sym.st_shndx > SHN_UNDEF &&
@@ -496,7 +603,7 @@ static int read_symbols(struct elf *elf)
sym->sec = find_section_by_index(elf, shndx);
if (!sym->sec) {
ERROR("couldn't find section for symbol %s", sym->name);
- goto err;
+ return -1;
}
if (GELF_ST_TYPE(sym->sym.st_info) == STT_SECTION) {
sym->name = sym->sec->name;
@@ -505,7 +612,13 @@ static int read_symbols(struct elf *elf)
} else
sym->sec = find_section_by_index(elf, 0);
- elf_add_symbol(elf, sym);
+ if (elf_add_symbol(elf, sym))
+ return -1;
+
+ if (sym->type == STT_FILE)
+ file = sym;
+ else if (sym->bind == STB_LOCAL)
+ sym->file = file;
}
if (opts.stats) {
@@ -518,18 +631,15 @@ static int read_symbols(struct elf *elf)
sec_for_each_sym(sec, sym) {
char *pname;
size_t pnamelen;
- if (sym->type != STT_FUNC)
- continue;
-
- if (sym->pfunc == NULL)
- sym->pfunc = sym;
- if (sym->cfunc == NULL)
- sym->cfunc = sym;
+ if (!sym->cold)
+ continue;
coldstr = strstr(sym->name, ".cold");
- if (!coldstr)
- continue;
+ if (!coldstr) {
+ ERROR("%s(): cold subfunction without \".cold\"?", sym->name);
+ return -1;
+ }
pnamelen = coldstr - sym->name;
pname = strndup(sym->name, pnamelen);
@@ -538,7 +648,9 @@ static int read_symbols(struct elf *elf)
return -1;
}
- pfunc = find_symbol_by_name(elf, pname);
+ pfunc = find_local_symbol_by_file_and_name(elf, sym->file, pname);
+ if (!pfunc)
+ pfunc = find_global_symbol_by_name(elf, pname);
free(pname);
if (!pfunc) {
@@ -546,8 +658,9 @@ static int read_symbols(struct elf *elf)
return -1;
}
- sym->pfunc = pfunc;
+ sym->pfunc = pfunc->alias;
pfunc->cfunc = sym;
+ pfunc->alias->cfunc = sym;
/*
* Unfortunately, -fnoreorder-functions puts the child
@@ -566,10 +679,6 @@ static int read_symbols(struct elf *elf)
}
return 0;
-
-err:
- free(sym);
- return -1;
}
static int mark_group_syms(struct elf *elf)
@@ -583,7 +692,7 @@ static int mark_group_syms(struct elf *elf)
return -1;
}
- list_for_each_entry(sec, &elf->sections, list) {
+ for_each_sec(elf, sec) {
if (sec->sh.sh_type == SHT_GROUP &&
sec->sh.sh_link == symtab->idx) {
sym = find_symbol_by_index(elf, sec->sh.sh_info);
@@ -624,7 +733,7 @@ static int elf_update_sym_relocs(struct elf *elf, struct symbol *sym)
static int elf_update_symbol(struct elf *elf, struct section *symtab,
struct section *symtab_shndx, struct symbol *sym)
{
- Elf32_Word shndx = sym->sec ? sym->sec->idx : SHN_UNDEF;
+ Elf32_Word shndx;
Elf_Data *symtab_data = NULL, *shndx_data = NULL;
Elf64_Xword entsize = symtab->sh.sh_entsize;
int max_idx, idx = sym->idx;
@@ -632,8 +741,7 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab,
bool is_special_shndx = sym->sym.st_shndx >= SHN_LORESERVE &&
sym->sym.st_shndx != SHN_XINDEX;
- if (is_special_shndx)
- shndx = sym->sym.st_shndx;
+ shndx = is_special_shndx ? sym->sym.st_shndx : sym->sec->idx;
s = elf_getscn(elf->elf, symtab->idx);
if (!s) {
@@ -731,7 +839,7 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab,
}
/* setup extended section index magic and write the symbol */
- if ((shndx >= SHN_UNDEF && shndx < SHN_LORESERVE) || is_special_shndx) {
+ if (shndx < SHN_LORESERVE || is_special_shndx) {
sym->sym.st_shndx = shndx;
if (!shndx_data)
shndx = 0;
@@ -751,24 +859,58 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab,
return 0;
}
-static struct symbol *
-__elf_create_symbol(struct elf *elf, struct symbol *sym)
+struct symbol *elf_create_symbol(struct elf *elf, const char *name,
+ struct section *sec, unsigned int bind,
+ unsigned int type, unsigned long offset,
+ size_t size)
{
struct section *symtab, *symtab_shndx;
Elf32_Word first_non_local, new_idx;
- struct symbol *old;
+ struct symbol *old, *sym;
- symtab = find_section_by_name(elf, ".symtab");
- if (symtab) {
- symtab_shndx = find_section_by_name(elf, ".symtab_shndx");
+ sym = calloc(1, sizeof(*sym));
+ if (!sym) {
+ ERROR_GLIBC("calloc");
+ return NULL;
+ }
+
+ sym->name = strdup(name);
+ if (!sym->name) {
+ ERROR_GLIBC("strdup");
+ return NULL;
+ }
+
+ if (type != STT_SECTION) {
+ sym->sym.st_name = elf_add_string(elf, NULL, sym->name);
+ if (sym->sym.st_name == -1)
+ return NULL;
+ }
+
+ if (sec) {
+ sym->sec = sec;
} else {
+ sym->sec = find_section_by_index(elf, 0);
+ if (!sym->sec) {
+ ERROR("no NULL section");
+ return NULL;
+ }
+ }
+
+ sym->sym.st_info = GELF_ST_INFO(bind, type);
+ sym->sym.st_value = offset;
+ sym->sym.st_size = size;
+
+ symtab = find_section_by_name(elf, ".symtab");
+ if (!symtab) {
ERROR("no .symtab");
return NULL;
}
+ symtab_shndx = find_section_by_name(elf, ".symtab_shndx");
+
new_idx = sec_num_entries(symtab);
- if (GELF_ST_BIND(sym->sym.st_info) != STB_LOCAL)
+ if (bind != STB_LOCAL)
goto non_local;
/*
@@ -806,10 +948,8 @@ __elf_create_symbol(struct elf *elf, struct symbol *sym)
non_local:
sym->idx = new_idx;
- if (elf_update_symbol(elf, symtab, symtab_shndx, sym)) {
- ERROR("elf_update_symbol");
+ if (sym->idx && elf_update_symbol(elf, symtab, symtab_shndx, sym))
return NULL;
- }
symtab->sh.sh_size += symtab->sh.sh_entsize;
mark_sec_changed(elf, symtab, true);
@@ -819,70 +959,28 @@ non_local:
mark_sec_changed(elf, symtab_shndx, true);
}
- return sym;
-}
-
-static struct symbol *
-elf_create_section_symbol(struct elf *elf, struct section *sec)
-{
- struct symbol *sym = calloc(1, sizeof(*sym));
-
- if (!sym) {
- ERROR_GLIBC("malloc");
+ if (elf_add_symbol(elf, sym))
return NULL;
- }
-
- sym->name = sec->name;
- sym->sec = sec;
-
- // st_name 0
- sym->sym.st_info = GELF_ST_INFO(STB_LOCAL, STT_SECTION);
- // st_other 0
- // st_value 0
- // st_size 0
-
- sym = __elf_create_symbol(elf, sym);
- if (sym)
- elf_add_symbol(elf, sym);
return sym;
}
-static int elf_add_string(struct elf *elf, struct section *strtab, char *str);
-
-struct symbol *
-elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size)
+struct symbol *elf_create_section_symbol(struct elf *elf, struct section *sec)
{
struct symbol *sym = calloc(1, sizeof(*sym));
- size_t namelen = strlen(orig->name) + sizeof("__pfx_");
- char *name = malloc(namelen);
- if (!sym || !name) {
- ERROR_GLIBC("malloc");
+ sym = elf_create_symbol(elf, sec->name, sec, STB_LOCAL, STT_SECTION, 0, 0);
+ if (!sym)
return NULL;
- }
- snprintf(name, namelen, "__pfx_%s", orig->name);
-
- sym->name = name;
- sym->sec = orig->sec;
-
- sym->sym.st_name = elf_add_string(elf, NULL, name);
- sym->sym.st_info = orig->sym.st_info;
- sym->sym.st_value = orig->sym.st_value - size;
- sym->sym.st_size = size;
-
- sym = __elf_create_symbol(elf, sym);
- if (sym)
- elf_add_symbol(elf, sym);
+ sec->sym = sym;
return sym;
}
-static struct reloc *elf_init_reloc(struct elf *elf, struct section *rsec,
- unsigned int reloc_idx,
- unsigned long offset, struct symbol *sym,
- s64 addend, unsigned int type)
+struct reloc *elf_init_reloc(struct elf *elf, struct section *rsec,
+ unsigned int reloc_idx, unsigned long offset,
+ struct symbol *sym, s64 addend, unsigned int type)
{
struct reloc *reloc, empty = { 0 };
@@ -922,9 +1020,9 @@ struct reloc *elf_init_reloc_text_sym(struct elf *elf, struct section *sec,
unsigned long insn_off)
{
struct symbol *sym = insn_sec->sym;
- int addend = insn_off;
+ s64 addend = insn_off;
- if (!(insn_sec->sh.sh_flags & SHF_EXECINSTR)) {
+ if (!is_text_sec(insn_sec)) {
ERROR("bad call to %s() for data symbol %s", __func__, sym->name);
return NULL;
}
@@ -939,8 +1037,6 @@ struct reloc *elf_init_reloc_text_sym(struct elf *elf, struct section *sec,
sym = elf_create_section_symbol(elf, insn_sec);
if (!sym)
return NULL;
-
- insn_sec->sym = sym;
}
return elf_init_reloc(elf, sec->rsec, reloc_idx, offset, sym, addend,
@@ -953,7 +1049,7 @@ struct reloc *elf_init_reloc_data_sym(struct elf *elf, struct section *sec,
struct symbol *sym,
s64 addend)
{
- if (sym->sec && (sec->sh.sh_flags & SHF_EXECINSTR)) {
+ if (is_text_sec(sec)) {
ERROR("bad call to %s() for text symbol %s", __func__, sym->name);
return NULL;
}
@@ -986,12 +1082,16 @@ static int read_relocs(struct elf *elf)
rsec->base->rsec = rsec;
- nr_reloc = 0;
+ /* nr_alloc_relocs=0: libelf owns d_buf */
+ rsec->nr_alloc_relocs = 0;
+
rsec->relocs = calloc(sec_num_entries(rsec), sizeof(*reloc));
if (!rsec->relocs) {
ERROR_GLIBC("calloc");
return -1;
}
+
+ nr_reloc = 0;
for (i = 0; i < sec_num_entries(rsec); i++) {
reloc = &rsec->relocs[i];
@@ -1044,6 +1144,12 @@ struct elf *elf_open_read(const char *name, int flags)
goto err;
}
+ elf->name = strdup(name);
+ if (!elf->name) {
+ ERROR_GLIBC("strdup");
+ return NULL;
+ }
+
if ((flags & O_ACCMODE) == O_RDONLY)
cmd = ELF_C_READ_MMAP;
else if ((flags & O_ACCMODE) == O_RDWR)
@@ -1081,11 +1187,142 @@ err:
return NULL;
}
-static int elf_add_string(struct elf *elf, struct section *strtab, char *str)
+struct elf *elf_create_file(GElf_Ehdr *ehdr, const char *name)
{
- Elf_Data *data;
- Elf_Scn *s;
- int len;
+ struct section *null, *symtab, *strtab, *shstrtab;
+ char *dir, *base, *tmp_name;
+ struct symbol *sym;
+ struct elf *elf;
+
+ elf_version(EV_CURRENT);
+
+ elf = calloc(1, sizeof(*elf));
+ if (!elf) {
+ ERROR_GLIBC("calloc");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&elf->sections);
+
+ dir = strdup(name);
+ if (!dir) {
+ ERROR_GLIBC("strdup");
+ return NULL;
+ }
+
+ dir = dirname(dir);
+
+ base = strdup(name);
+ if (!base) {
+ ERROR_GLIBC("strdup");
+ return NULL;
+ }
+
+ base = basename(base);
+
+ tmp_name = malloc(256);
+ if (!tmp_name) {
+ ERROR_GLIBC("malloc");
+ return NULL;
+ }
+
+ snprintf(tmp_name, 256, "%s/%s.XXXXXX", dir, base);
+
+ elf->fd = mkstemp(tmp_name);
+ if (elf->fd == -1) {
+ ERROR_GLIBC("can't create tmp file");
+ exit(1);
+ }
+
+ elf->tmp_name = tmp_name;
+
+ elf->name = strdup(name);
+ if (!elf->name) {
+ ERROR_GLIBC("strdup");
+ return NULL;
+ }
+
+ elf->elf = elf_begin(elf->fd, ELF_C_WRITE, NULL);
+ if (!elf->elf) {
+ ERROR_ELF("elf_begin");
+ return NULL;
+ }
+
+ if (!gelf_newehdr(elf->elf, ELFCLASS64)) {
+ ERROR_ELF("gelf_newehdr");
+ return NULL;
+ }
+
+ memcpy(&elf->ehdr, ehdr, sizeof(elf->ehdr));
+
+ if (!gelf_update_ehdr(elf->elf, &elf->ehdr)) {
+ ERROR_ELF("gelf_update_ehdr");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&elf->symbols);
+
+ if (!elf_alloc_hash(section, 1000) ||
+ !elf_alloc_hash(section_name, 1000) ||
+ !elf_alloc_hash(symbol, 10000) ||
+ !elf_alloc_hash(symbol_name, 10000) ||
+ !elf_alloc_hash(reloc, 100000))
+ return NULL;
+
+ null = elf_create_section(elf, NULL, 0, 0, SHT_NULL, 0, 0);
+ shstrtab = elf_create_section(elf, NULL, 0, 0, SHT_STRTAB, 1, 0);
+ strtab = elf_create_section(elf, NULL, 0, 0, SHT_STRTAB, 1, 0);
+
+ if (!null || !shstrtab || !strtab)
+ return NULL;
+
+ null->name = "";
+ shstrtab->name = ".shstrtab";
+ strtab->name = ".strtab";
+
+ null->sh.sh_name = elf_add_string(elf, shstrtab, null->name);
+ shstrtab->sh.sh_name = elf_add_string(elf, shstrtab, shstrtab->name);
+ strtab->sh.sh_name = elf_add_string(elf, shstrtab, strtab->name);
+
+ if (null->sh.sh_name == -1 || shstrtab->sh.sh_name == -1 || strtab->sh.sh_name == -1)
+ return NULL;
+
+ elf_hash_add(section_name, &null->name_hash, str_hash(null->name));
+ elf_hash_add(section_name, &strtab->name_hash, str_hash(strtab->name));
+ elf_hash_add(section_name, &shstrtab->name_hash, str_hash(shstrtab->name));
+
+ if (elf_add_string(elf, strtab, "") == -1)
+ return NULL;
+
+ symtab = elf_create_section(elf, ".symtab", 0x18, 0x18, SHT_SYMTAB, 0x8, 0);
+ if (!symtab)
+ return NULL;
+
+ symtab->sh.sh_link = strtab->idx;
+ symtab->sh.sh_info = 1;
+
+ elf->ehdr.e_shstrndx = shstrtab->idx;
+ if (!gelf_update_ehdr(elf->elf, &elf->ehdr)) {
+ ERROR_ELF("gelf_update_ehdr");
+ return NULL;
+ }
+
+ sym = calloc(1, sizeof(*sym));
+ if (!sym) {
+ ERROR_GLIBC("calloc");
+ return NULL;
+ }
+
+ sym->name = "";
+ sym->sec = null;
+ elf_add_symbol(elf, sym);
+
+ return elf;
+}
+
+unsigned int elf_add_string(struct elf *elf, struct section *strtab, const char *str)
+{
+ unsigned int offset;
if (!strtab)
strtab = find_section_by_name(elf, ".strtab");
@@ -1094,76 +1331,109 @@ static int elf_add_string(struct elf *elf, struct section *strtab, char *str)
return -1;
}
- s = elf_getscn(elf->elf, strtab->idx);
+ if (!strtab->sh.sh_addralign) {
+ ERROR("'%s': invalid sh_addralign", strtab->name);
+ return -1;
+ }
+
+ offset = ALIGN_UP(strtab->sh.sh_size, strtab->sh.sh_addralign);
+
+ if (!elf_add_data(elf, strtab, str, strlen(str) + 1))
+ return -1;
+
+ return offset;
+}
+
+void *elf_add_data(struct elf *elf, struct section *sec, const void *data, size_t size)
+{
+ unsigned long offset;
+ Elf_Scn *s;
+
+ if (!sec->sh.sh_addralign) {
+ ERROR("'%s': invalid sh_addralign", sec->name);
+ return NULL;
+ }
+
+ s = elf_getscn(elf->elf, sec->idx);
if (!s) {
ERROR_ELF("elf_getscn");
- return -1;
+ return NULL;
}
- data = elf_newdata(s);
- if (!data) {
+ sec->data = elf_newdata(s);
+ if (!sec->data) {
ERROR_ELF("elf_newdata");
- return -1;
+ return NULL;
}
- data->d_buf = str;
- data->d_size = strlen(str) + 1;
- data->d_align = 1;
+ sec->data->d_buf = calloc(1, size);
+ if (!sec->data->d_buf) {
+ ERROR_GLIBC("calloc");
+ return NULL;
+ }
- len = strtab->sh.sh_size;
- strtab->sh.sh_size += data->d_size;
+ if (data)
+ memcpy(sec->data->d_buf, data, size);
- mark_sec_changed(elf, strtab, true);
+ sec->data->d_size = size;
+ sec->data->d_align = 1;
- return len;
+ offset = ALIGN_UP(sec->sh.sh_size, sec->sh.sh_addralign);
+ sec->sh.sh_size = offset + size;
+
+ mark_sec_changed(elf, sec, true);
+
+ return sec->data->d_buf;
}
struct section *elf_create_section(struct elf *elf, const char *name,
- size_t entsize, unsigned int nr)
+ size_t size, size_t entsize,
+ unsigned int type, unsigned int align,
+ unsigned int flags)
{
struct section *sec, *shstrtab;
- size_t size = entsize * nr;
Elf_Scn *s;
- sec = malloc(sizeof(*sec));
+ if (name && find_section_by_name(elf, name)) {
+ ERROR("section '%s' already exists", name);
+ return NULL;
+ }
+
+ sec = calloc(1, sizeof(*sec));
if (!sec) {
- ERROR_GLIBC("malloc");
+ ERROR_GLIBC("calloc");
return NULL;
}
- memset(sec, 0, sizeof(*sec));
INIT_LIST_HEAD(&sec->symbol_list);
+ /* don't actually create the section, just the data structures */
+ if (type == SHT_NULL)
+ goto add;
+
s = elf_newscn(elf->elf);
if (!s) {
ERROR_ELF("elf_newscn");
return NULL;
}
- sec->name = strdup(name);
- if (!sec->name) {
- ERROR_GLIBC("strdup");
- return NULL;
- }
-
sec->idx = elf_ndxscn(s);
- sec->data = elf_newdata(s);
- if (!sec->data) {
- ERROR_ELF("elf_newdata");
- return NULL;
- }
+ if (size) {
+ sec->data = elf_newdata(s);
+ if (!sec->data) {
+ ERROR_ELF("elf_newdata");
+ return NULL;
+ }
- sec->data->d_size = size;
- sec->data->d_align = 1;
+ sec->data->d_size = size;
+ sec->data->d_align = 1;
- if (size) {
- sec->data->d_buf = malloc(size);
+ sec->data->d_buf = calloc(1, size);
if (!sec->data->d_buf) {
- ERROR_GLIBC("malloc");
+ ERROR_GLIBC("calloc");
return NULL;
}
- memset(sec->data->d_buf, 0, size);
}
if (!gelf_getshdr(s, &sec->sh)) {
@@ -1173,34 +1443,152 @@ struct section *elf_create_section(struct elf *elf, const char *name,
sec->sh.sh_size = size;
sec->sh.sh_entsize = entsize;
- sec->sh.sh_type = SHT_PROGBITS;
- sec->sh.sh_addralign = 1;
- sec->sh.sh_flags = SHF_ALLOC;
-
- /* Add section name to .shstrtab (or .strtab for Clang) */
- shstrtab = find_section_by_name(elf, ".shstrtab");
- if (!shstrtab)
- shstrtab = find_section_by_name(elf, ".strtab");
- if (!shstrtab) {
- ERROR("can't find .shstrtab or .strtab section");
- return NULL;
+ sec->sh.sh_type = type;
+ sec->sh.sh_addralign = align;
+ sec->sh.sh_flags = flags;
+
+ if (name) {
+ sec->name = strdup(name);
+ if (!sec->name) {
+ ERROR("strdup");
+ return NULL;
+ }
+
+ /* Add section name to .shstrtab (or .strtab for Clang) */
+ shstrtab = find_section_by_name(elf, ".shstrtab");
+ if (!shstrtab) {
+ shstrtab = find_section_by_name(elf, ".strtab");
+ if (!shstrtab) {
+ ERROR("can't find .shstrtab or .strtab");
+ return NULL;
+ }
+ }
+ sec->sh.sh_name = elf_add_string(elf, shstrtab, sec->name);
+ if (sec->sh.sh_name == -1)
+ return NULL;
+
+ elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name));
}
- sec->sh.sh_name = elf_add_string(elf, shstrtab, sec->name);
- if (sec->sh.sh_name == -1)
- return NULL;
+add:
list_add_tail(&sec->list, &elf->sections);
elf_hash_add(section, &sec->hash, sec->idx);
- elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name));
mark_sec_changed(elf, sec, true);
return sec;
}
-static struct section *elf_create_rela_section(struct elf *elf,
- struct section *sec,
- unsigned int reloc_nr)
+static int elf_alloc_reloc(struct elf *elf, struct section *rsec)
+{
+ struct reloc *old_relocs, *old_relocs_end, *new_relocs;
+ unsigned int nr_relocs_old = sec_num_entries(rsec);
+ unsigned int nr_relocs_new = nr_relocs_old + 1;
+ unsigned long nr_alloc;
+ struct symbol *sym;
+
+ if (!rsec->data) {
+ rsec->data = elf_newdata(elf_getscn(elf->elf, rsec->idx));
+ if (!rsec->data) {
+ ERROR_ELF("elf_newdata");
+ return -1;
+ }
+
+ rsec->data->d_align = 1;
+ rsec->data->d_type = ELF_T_RELA;
+ rsec->data->d_buf = NULL;
+ }
+
+ rsec->data->d_size = nr_relocs_new * elf_rela_size(elf);
+ rsec->sh.sh_size = rsec->data->d_size;
+
+ nr_alloc = MAX(64, ALIGN_UP_POW2(nr_relocs_new));
+ if (nr_alloc <= rsec->nr_alloc_relocs)
+ return 0;
+
+ if (rsec->data->d_buf && !rsec->nr_alloc_relocs) {
+ void *orig_buf = rsec->data->d_buf;
+
+ /*
+ * The original d_buf is owned by libelf so it can't be
+ * realloced.
+ */
+ rsec->data->d_buf = malloc(nr_alloc * elf_rela_size(elf));
+ if (!rsec->data->d_buf) {
+ ERROR_GLIBC("malloc");
+ return -1;
+ }
+ memcpy(rsec->data->d_buf, orig_buf,
+ nr_relocs_old * elf_rela_size(elf));
+ } else {
+ rsec->data->d_buf = realloc(rsec->data->d_buf,
+ nr_alloc * elf_rela_size(elf));
+ if (!rsec->data->d_buf) {
+ ERROR_GLIBC("realloc");
+ return -1;
+ }
+ }
+
+ rsec->nr_alloc_relocs = nr_alloc;
+
+ old_relocs = rsec->relocs;
+ new_relocs = calloc(nr_alloc, sizeof(struct reloc));
+ if (!new_relocs) {
+ ERROR_GLIBC("calloc");
+ return -1;
+ }
+
+ if (!old_relocs)
+ goto done;
+
+ /*
+ * The struct reloc's address has changed. Update all the symbols and
+ * relocs which reference it.
+ */
+
+ old_relocs_end = &old_relocs[nr_relocs_old];
+ for_each_sym(elf, sym) {
+ struct reloc *reloc;
+
+ reloc = sym->relocs;
+ if (!reloc)
+ continue;
+
+ if (reloc >= old_relocs && reloc < old_relocs_end)
+ sym->relocs = &new_relocs[reloc - old_relocs];
+
+ while (1) {
+ struct reloc *next_reloc = sym_next_reloc(reloc);
+
+ if (!next_reloc)
+ break;
+
+ if (next_reloc >= old_relocs && next_reloc < old_relocs_end)
+ set_sym_next_reloc(reloc, &new_relocs[next_reloc - old_relocs]);
+
+ reloc = next_reloc;
+ }
+ }
+
+ memcpy(new_relocs, old_relocs, nr_relocs_old * sizeof(struct reloc));
+
+ for (int i = 0; i < nr_relocs_old; i++) {
+ struct reloc *old = &old_relocs[i];
+ struct reloc *new = &new_relocs[i];
+ u32 key = reloc_hash(old);
+
+ elf_hash_del(reloc, &old->hash, key);
+ elf_hash_add(reloc, &new->hash, key);
+ }
+
+ free(old_relocs);
+done:
+ rsec->relocs = new_relocs;
+ return 0;
+}
+
+struct section *elf_create_rela_section(struct elf *elf, struct section *sec,
+ unsigned int nr_relocs)
{
struct section *rsec;
char *rsec_name;
@@ -1213,41 +1601,72 @@ static struct section *elf_create_rela_section(struct elf *elf,
strcpy(rsec_name, ".rela");
strcat(rsec_name, sec->name);
- rsec = elf_create_section(elf, rsec_name, elf_rela_size(elf), reloc_nr);
+ rsec = elf_create_section(elf, rsec_name, nr_relocs * elf_rela_size(elf),
+ elf_rela_size(elf), SHT_RELA, elf_addr_size(elf),
+ SHF_INFO_LINK);
free(rsec_name);
if (!rsec)
return NULL;
- rsec->data->d_type = ELF_T_RELA;
- rsec->sh.sh_type = SHT_RELA;
- rsec->sh.sh_addralign = elf_addr_size(elf);
- rsec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx;
- rsec->sh.sh_info = sec->idx;
- rsec->sh.sh_flags = SHF_INFO_LINK;
+ if (nr_relocs) {
+ rsec->data->d_type = ELF_T_RELA;
- rsec->relocs = calloc(sec_num_entries(rsec), sizeof(struct reloc));
- if (!rsec->relocs) {
- ERROR_GLIBC("calloc");
- return NULL;
+ rsec->nr_alloc_relocs = nr_relocs;
+ rsec->relocs = calloc(nr_relocs, sizeof(struct reloc));
+ if (!rsec->relocs) {
+ ERROR_GLIBC("calloc");
+ return NULL;
+ }
}
+ rsec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx;
+ rsec->sh.sh_info = sec->idx;
+
sec->rsec = rsec;
rsec->base = sec;
return rsec;
}
+struct reloc *elf_create_reloc(struct elf *elf, struct section *sec,
+ unsigned long offset,
+ struct symbol *sym, s64 addend,
+ unsigned int type)
+{
+ struct section *rsec = sec->rsec;
+
+ if (!rsec) {
+ rsec = elf_create_rela_section(elf, sec, 0);
+ if (!rsec)
+ return NULL;
+ }
+
+ if (find_reloc_by_dest(elf, sec, offset)) {
+ ERROR_FUNC(sec, offset, "duplicate reloc");
+ return NULL;
+ }
+
+ if (elf_alloc_reloc(elf, rsec))
+ return NULL;
+
+ mark_sec_changed(elf, rsec, true);
+
+ return elf_init_reloc(elf, rsec, sec_num_entries(rsec) - 1, offset, sym,
+ addend, type);
+}
+
struct section *elf_create_section_pair(struct elf *elf, const char *name,
size_t entsize, unsigned int nr,
- unsigned int reloc_nr)
+ unsigned int nr_relocs)
{
struct section *sec;
- sec = elf_create_section(elf, name, entsize, nr);
+ sec = elf_create_section(elf, name, nr * entsize, entsize,
+ SHT_PROGBITS, 1, SHF_ALLOC);
if (!sec)
return NULL;
- if (!elf_create_rela_section(elf, sec, reloc_nr))
+ if (!elf_create_rela_section(elf, sec, nr_relocs))
return NULL;
return sec;
@@ -1282,7 +1701,7 @@ int elf_write_insn(struct elf *elf, struct section *sec,
*/
static int elf_truncate_section(struct elf *elf, struct section *sec)
{
- u64 size = sec->sh.sh_size;
+ u64 size = sec_size(sec);
bool truncated = false;
Elf_Data *data = NULL;
Elf_Scn *s;
@@ -1296,7 +1715,6 @@ static int elf_truncate_section(struct elf *elf, struct section *sec)
for (;;) {
/* get next data descriptor for the relevant section */
data = elf_getdata(s, data);
-
if (!data) {
if (size) {
ERROR("end of section data but non-zero size left\n");
@@ -1332,8 +1750,8 @@ int elf_write(struct elf *elf)
/* Update changed relocation sections and section headers: */
list_for_each_entry(sec, &elf->sections, list) {
- if (sec->truncate)
- elf_truncate_section(elf, sec);
+ if (sec->truncate && elf_truncate_section(elf, sec))
+ return -1;
if (sec_changed(sec)) {
s = elf_getscn(elf->elf, sec->idx);
@@ -1366,7 +1784,7 @@ int elf_write(struct elf *elf)
return 0;
}
-void elf_close(struct elf *elf)
+int elf_close(struct elf *elf)
{
if (elf->elf)
elf_end(elf->elf);
@@ -1374,8 +1792,12 @@ void elf_close(struct elf *elf)
if (elf->fd > 0)
close(elf->fd);
+ if (elf->tmp_name && rename(elf->tmp_name, elf->name))
+ return -1;
+
/*
* NOTE: All remaining allocations are leaked on purpose. Objtool is
* about to exit anyway.
*/
+ return 0;
}
diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h
index be33c7b43180..8866158975fc 100644
--- a/tools/objtool/include/objtool/arch.h
+++ b/tools/objtool/include/objtool/arch.h
@@ -71,7 +71,7 @@ struct stack_op {
struct instruction;
-int arch_ftrace_match(char *name);
+int arch_ftrace_match(const char *name);
void arch_initial_func_cfi_state(struct cfi_init_state *state);
@@ -83,7 +83,8 @@ bool arch_callee_saved_reg(unsigned char reg);
unsigned long arch_jump_destination(struct instruction *insn);
-unsigned long arch_dest_reloc_offset(int addend);
+s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc);
+u64 arch_adjusted_addend(struct reloc *reloc);
const char *arch_nop_insn(int len);
const char *arch_ret_insn(int len);
@@ -102,4 +103,15 @@ bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc);
unsigned int arch_reloc_size(struct reloc *reloc);
unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table);
+extern const char *arch_reg_name[CFI_NUM_REGS];
+
+#ifdef DISAS
+
+#include <bfd.h>
+#include <dis-asm.h>
+
+int arch_disas_info_init(struct disassemble_info *dinfo);
+
+#endif /* DISAS */
+
#endif /* _ARCH_H */
diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h
index ab22673862e1..b9e229ed4dc0 100644
--- a/tools/objtool/include/objtool/builtin.h
+++ b/tools/objtool/include/objtool/builtin.h
@@ -9,12 +9,15 @@
struct opts {
/* actions: */
+ bool cfi;
+ bool checksum;
bool dump_orc;
bool hack_jump_label;
bool hack_noinstr;
bool hack_skylake;
bool ibt;
bool mcount;
+ bool noabs;
bool noinstr;
bool orc;
bool retpoline;
@@ -25,11 +28,12 @@ struct opts {
bool static_call;
bool uaccess;
int prefix;
- bool cfi;
- bool noabs;
+ const char *disas;
/* options: */
bool backtrace;
+ bool backup;
+ const char *debug_checksum;
bool dryrun;
bool link;
bool mnop;
@@ -38,8 +42,10 @@ struct opts {
const char *output;
bool sec_address;
bool stats;
+ const char *trace;
bool verbose;
bool werror;
+ bool wide;
};
extern struct opts opts;
@@ -48,6 +54,8 @@ int cmd_parse_options(int argc, const char **argv, const char * const usage[]);
int objtool_run(int argc, const char **argv);
-void print_args(void);
+int make_backup(void);
+
+int cmd_klp(int argc, const char **argv);
#endif /* _BUILTIN_H */
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index 00fb745e7233..2e1346ad5e92 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -36,6 +36,19 @@ struct alt_group {
struct cfi_state **cfi;
bool ignore;
+ unsigned int feature;
+};
+
+enum alternative_type {
+ ALT_TYPE_INSTRUCTIONS,
+ ALT_TYPE_JUMP_TABLE,
+ ALT_TYPE_EX_TABLE,
+};
+
+struct alternative {
+ struct alternative *next;
+ struct instruction *insn;
+ enum alternative_type type;
};
#define INSN_CHUNK_BITS 8
@@ -64,8 +77,11 @@ struct instruction {
noendbr : 1,
unret : 1,
visited : 4,
- no_reloc : 1;
- /* 10 bit hole */
+ no_reloc : 1,
+ hole : 1,
+ fake : 1,
+ trace : 1;
+ /* 9 bit hole */
struct alt_group *alt_group;
struct instruction *jump_dest;
@@ -115,6 +131,15 @@ static inline bool is_jump(struct instruction *insn)
return is_static_jump(insn) || is_dynamic_jump(insn);
}
+static inline struct symbol *insn_call_dest(struct instruction *insn)
+{
+ if (insn->type == INSN_JUMP_DYNAMIC ||
+ insn->type == INSN_CALL_DYNAMIC)
+ return NULL;
+
+ return insn->_call_dest;
+}
+
struct instruction *find_insn(struct objtool_file *file,
struct section *sec, unsigned long offset);
@@ -125,4 +150,14 @@ struct instruction *next_insn_same_sec(struct objtool_file *file, struct instruc
insn && insn->sec == _sec; \
insn = next_insn_same_sec(file, insn))
+#define sym_for_each_insn(file, sym, insn) \
+ for (insn = find_insn(file, sym->sec, sym->offset); \
+ insn && insn->offset < sym->offset + sym->len; \
+ insn = next_insn_same_sec(file, insn))
+
+const char *objtool_disas_insn(struct instruction *insn);
+
+extern size_t sym_name_max_len;
+extern struct disas_context *objtool_disas_ctx;
+
#endif /* _CHECK_H */
diff --git a/tools/objtool/include/objtool/checksum.h b/tools/objtool/include/objtool/checksum.h
new file mode 100644
index 000000000000..7fe21608722a
--- /dev/null
+++ b/tools/objtool/include/objtool/checksum.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _OBJTOOL_CHECKSUM_H
+#define _OBJTOOL_CHECKSUM_H
+
+#include <objtool/elf.h>
+
+#ifdef BUILD_KLP
+
+static inline void checksum_init(struct symbol *func)
+{
+ if (func && !func->csum.state) {
+ func->csum.state = XXH3_createState();
+ XXH3_64bits_reset(func->csum.state);
+ }
+}
+
+static inline void checksum_update(struct symbol *func,
+ struct instruction *insn,
+ const void *data, size_t size)
+{
+ XXH3_64bits_update(func->csum.state, data, size);
+ dbg_checksum(func, insn, XXH3_64bits_digest(func->csum.state));
+}
+
+static inline void checksum_finish(struct symbol *func)
+{
+ if (func && func->csum.state) {
+ func->csum.checksum = XXH3_64bits_digest(func->csum.state);
+ func->csum.state = NULL;
+ }
+}
+
+#else /* !BUILD_KLP */
+
+static inline void checksum_init(struct symbol *func) {}
+static inline void checksum_update(struct symbol *func,
+ struct instruction *insn,
+ const void *data, size_t size) {}
+static inline void checksum_finish(struct symbol *func) {}
+
+#endif /* !BUILD_KLP */
+
+#endif /* _OBJTOOL_CHECKSUM_H */
diff --git a/tools/objtool/include/objtool/checksum_types.h b/tools/objtool/include/objtool/checksum_types.h
new file mode 100644
index 000000000000..507efdd8ab5b
--- /dev/null
+++ b/tools/objtool/include/objtool/checksum_types.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _OBJTOOL_CHECKSUM_TYPES_H
+#define _OBJTOOL_CHECKSUM_TYPES_H
+
+struct sym_checksum {
+ u64 addr;
+ u64 checksum;
+};
+
+#ifdef BUILD_KLP
+
+#include <xxhash.h>
+
+struct checksum {
+ XXH3_state_t *state;
+ XXH64_hash_t checksum;
+};
+
+#else /* !BUILD_KLP */
+
+struct checksum {};
+
+#endif /* !BUILD_KLP */
+
+#endif /* _OBJTOOL_CHECKSUM_TYPES_H */
diff --git a/tools/objtool/include/objtool/disas.h b/tools/objtool/include/objtool/disas.h
new file mode 100644
index 000000000000..e8f395eff159
--- /dev/null
+++ b/tools/objtool/include/objtool/disas.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ */
+
+#ifndef _DISAS_H
+#define _DISAS_H
+
+struct alternative;
+struct disas_context;
+struct disassemble_info;
+
+#ifdef DISAS
+
+struct disas_context *disas_context_create(struct objtool_file *file);
+void disas_context_destroy(struct disas_context *dctx);
+void disas_warned_funcs(struct disas_context *dctx);
+void disas_funcs(struct disas_context *dctx);
+int disas_info_init(struct disassemble_info *dinfo,
+ int arch, int mach32, int mach64,
+ const char *options);
+size_t disas_insn(struct disas_context *dctx, struct instruction *insn);
+char *disas_result(struct disas_context *dctx);
+void disas_print_info(FILE *stream, struct instruction *insn, int depth,
+ const char *format, ...);
+void disas_print_insn(FILE *stream, struct disas_context *dctx,
+ struct instruction *insn, int depth,
+ const char *format, ...);
+char *disas_alt_name(struct alternative *alt);
+const char *disas_alt_type_name(struct instruction *insn);
+
+#else /* DISAS */
+
+#include <objtool/warn.h>
+
+static inline struct disas_context *disas_context_create(struct objtool_file *file)
+{
+ WARN("Rebuild with libopcodes for disassembly support");
+ return NULL;
+}
+
+static inline void disas_context_destroy(struct disas_context *dctx) {}
+static inline void disas_warned_funcs(struct disas_context *dctx) {}
+static inline void disas_funcs(struct disas_context *dctx) {}
+
+static inline int disas_info_init(struct disassemble_info *dinfo,
+ int arch, int mach32, int mach64,
+ const char *options)
+{
+ return -1;
+}
+
+static inline size_t disas_insn(struct disas_context *dctx,
+ struct instruction *insn)
+{
+ return -1;
+}
+
+static inline char *disas_result(struct disas_context *dctx)
+{
+ return NULL;
+}
+
+static inline void disas_print_info(FILE *stream, struct instruction *insn,
+ int depth, const char *format, ...) {}
+static inline void disas_print_insn(FILE *stream, struct disas_context *dctx,
+ struct instruction *insn, int depth,
+ const char *format, ...) {}
+static inline char *disas_alt_name(struct alternative *alt)
+{
+ return NULL;
+}
+
+static inline const char *disas_alt_type_name(struct instruction *insn)
+{
+ return NULL;
+}
+
+#endif /* DISAS */
+
+#endif /* _DISAS_H */
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index df8434d3b744..e12c516bd320 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -8,12 +8,21 @@
#include <stdio.h>
#include <gelf.h>
+#include <linux/string.h>
#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/rbtree.h>
#include <linux/jhash.h>
+
+#include <objtool/endianness.h>
+#include <objtool/checksum_types.h>
#include <arch/elf.h>
+#define SEC_NAME_LEN 1024
+#define SYM_NAME_LEN 512
+
+#define bswap_if_needed(elf, val) __bswap_if_needed(&elf->ehdr, val)
+
#ifdef LIBELF_USE_DEPRECATED
# define elf_getshdrnum elf_getshnum
# define elf_getshdrstrndx elf_getshstrndx
@@ -40,24 +49,27 @@ struct section {
struct section *base, *rsec;
struct symbol *sym;
Elf_Data *data;
- char *name;
+ const char *name;
int idx;
bool _changed, text, rodata, noinstr, init, truncate;
struct reloc *relocs;
+ unsigned long nr_alloc_relocs;
+ struct section *twin;
};
struct symbol {
struct list_head list;
+ struct list_head global_list;
struct rb_node node;
struct elf_hash_node hash;
struct elf_hash_node name_hash;
GElf_Sym sym;
struct section *sec;
- char *name;
+ const char *name, *demangled_name;
unsigned int idx, len;
unsigned long offset;
unsigned long __subtree_last;
- struct symbol *pfunc, *cfunc, *alias;
+ struct symbol *pfunc, *cfunc, *alias, *file;
unsigned char bind, type;
u8 uaccess_safe : 1;
u8 static_call_tramp : 1;
@@ -71,9 +83,17 @@ struct symbol {
u8 frame_pointer : 1;
u8 ignore : 1;
u8 nocfi : 1;
+ u8 cold : 1;
+ u8 prefix : 1;
+ u8 debug_checksum : 1;
+ u8 changed : 1;
+ u8 included : 1;
+ u8 klp : 1;
struct list_head pv_target;
struct reloc *relocs;
struct section *group_sec;
+ struct checksum csum;
+ struct symbol *twin, *clone;
};
struct reloc {
@@ -88,9 +108,10 @@ struct elf {
GElf_Ehdr ehdr;
int fd;
bool changed;
- char *name;
+ const char *name, *tmp_name;
unsigned int num_files;
struct list_head sections;
+ struct list_head symbols;
unsigned long num_relocs;
int symbol_bits;
@@ -110,14 +131,37 @@ struct elf {
};
struct elf *elf_open_read(const char *name, int flags);
+struct elf *elf_create_file(GElf_Ehdr *ehdr, const char *name);
struct section *elf_create_section(struct elf *elf, const char *name,
- size_t entsize, unsigned int nr);
+ size_t size, size_t entsize,
+ unsigned int type, unsigned int align,
+ unsigned int flags);
struct section *elf_create_section_pair(struct elf *elf, const char *name,
size_t entsize, unsigned int nr,
unsigned int reloc_nr);
-struct symbol *elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size);
+struct section *elf_create_rela_section(struct elf *elf, struct section *sec,
+ unsigned int reloc_nr);
+
+struct symbol *elf_create_symbol(struct elf *elf, const char *name,
+ struct section *sec, unsigned int bind,
+ unsigned int type, unsigned long offset,
+ size_t size);
+struct symbol *elf_create_section_symbol(struct elf *elf, struct section *sec);
+
+void *elf_add_data(struct elf *elf, struct section *sec, const void *data,
+ size_t size);
+
+unsigned int elf_add_string(struct elf *elf, struct section *strtab, const char *str);
+
+struct reloc *elf_create_reloc(struct elf *elf, struct section *sec,
+ unsigned long offset, struct symbol *sym,
+ s64 addend, unsigned int type);
+
+struct reloc *elf_init_reloc(struct elf *elf, struct section *rsec,
+ unsigned int reloc_idx, unsigned long offset,
+ struct symbol *sym, s64 addend, unsigned int type);
struct reloc *elf_init_reloc_text_sym(struct elf *elf, struct section *sec,
unsigned long offset,
@@ -131,16 +175,17 @@ struct reloc *elf_init_reloc_data_sym(struct elf *elf, struct section *sec,
struct symbol *sym,
s64 addend);
-int elf_write_insn(struct elf *elf, struct section *sec,
- unsigned long offset, unsigned int len,
- const char *insn);
+int elf_write_insn(struct elf *elf, struct section *sec, unsigned long offset,
+ unsigned int len, const char *insn);
+
int elf_write(struct elf *elf);
-void elf_close(struct elf *elf);
+int elf_close(struct elf *elf);
struct section *find_section_by_name(const struct elf *elf, const char *name);
struct symbol *find_func_by_offset(struct section *sec, unsigned long offset);
struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset);
struct symbol *find_symbol_by_name(const struct elf *elf, const char *name);
+struct symbol *find_global_symbol_by_name(const struct elf *elf, const char *name);
struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset);
int find_symbol_hole_containing(const struct section *sec, unsigned long offset);
struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset);
@@ -178,11 +223,76 @@ static inline unsigned int elf_text_rela_type(struct elf *elf)
return elf_addr_size(elf) == 4 ? R_TEXT32 : R_TEXT64;
}
+static inline bool is_undef_sym(struct symbol *sym)
+{
+ return !sym->sec->idx;
+}
+
+static inline bool is_null_sym(struct symbol *sym)
+{
+ return !sym->idx;
+}
+
+static inline bool is_sec_sym(struct symbol *sym)
+{
+ return sym->type == STT_SECTION;
+}
+
+static inline bool is_object_sym(struct symbol *sym)
+{
+ return sym->type == STT_OBJECT;
+}
+
+static inline bool is_func_sym(struct symbol *sym)
+{
+ return sym->type == STT_FUNC;
+}
+
+static inline bool is_file_sym(struct symbol *sym)
+{
+ return sym->type == STT_FILE;
+}
+
+static inline bool is_notype_sym(struct symbol *sym)
+{
+ return sym->type == STT_NOTYPE;
+}
+
+static inline bool is_global_sym(struct symbol *sym)
+{
+ return sym->bind == STB_GLOBAL;
+}
+
+static inline bool is_weak_sym(struct symbol *sym)
+{
+ return sym->bind == STB_WEAK;
+}
+
+static inline bool is_local_sym(struct symbol *sym)
+{
+ return sym->bind == STB_LOCAL;
+}
+
+static inline bool is_prefix_func(struct symbol *sym)
+{
+ return sym->prefix;
+}
+
static inline bool is_reloc_sec(struct section *sec)
{
return sec->sh.sh_type == SHT_RELA || sec->sh.sh_type == SHT_REL;
}
+static inline bool is_string_sec(struct section *sec)
+{
+ return sec->sh.sh_flags & SHF_STRINGS;
+}
+
+static inline bool is_text_sec(struct section *sec)
+{
+ return sec->sh.sh_flags & SHF_EXECINSTR;
+}
+
static inline bool sec_changed(struct section *sec)
{
return sec->_changed;
@@ -223,6 +333,11 @@ static inline bool is_32bit_reloc(struct reloc *reloc)
return reloc->sec->sh.sh_entsize < 16;
}
+static inline unsigned long sec_size(struct section *sec)
+{
+ return sec->sh.sh_size;
+}
+
#define __get_reloc_field(reloc, field) \
({ \
is_32bit_reloc(reloc) ? \
@@ -300,6 +415,15 @@ static inline void set_reloc_type(struct elf *elf, struct reloc *reloc, unsigned
mark_sec_changed(elf, reloc->sec, true);
}
+static inline unsigned int annotype(struct elf *elf, struct section *sec,
+ struct reloc *reloc)
+{
+ unsigned int type;
+
+ type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * 8) + 4);
+ return bswap_if_needed(elf, type);
+}
+
#define RELOC_JUMP_TABLE_BIT 1UL
/* Does reloc mark the beginning of a jump table? */
@@ -325,28 +449,54 @@ static inline void set_sym_next_reloc(struct reloc *reloc, struct reloc *next)
reloc->_sym_next_reloc = (unsigned long)next | bit;
}
-#define for_each_sec(file, sec) \
- list_for_each_entry(sec, &file->elf->sections, list)
+#define for_each_sec(elf, sec) \
+ list_for_each_entry(sec, &elf->sections, list)
#define sec_for_each_sym(sec, sym) \
list_for_each_entry(sym, &sec->symbol_list, list)
-#define for_each_sym(file, sym) \
- for (struct section *__sec, *__fake = (struct section *)1; \
- __fake; __fake = NULL) \
- for_each_sec(file, __sec) \
- sec_for_each_sym(__sec, sym)
+#define sec_prev_sym(sym) \
+ sym->sec && sym->list.prev != &sym->sec->symbol_list ? \
+ list_prev_entry(sym, list) : NULL
+
+#define for_each_sym(elf, sym) \
+ list_for_each_entry(sym, &elf->symbols, global_list)
+
+#define for_each_sym_continue(elf, sym) \
+ list_for_each_entry_continue(sym, &elf->symbols, global_list)
+
+#define rsec_next_reloc(rsec, reloc) \
+ reloc_idx(reloc) < sec_num_entries(rsec) - 1 ? reloc + 1 : NULL
#define for_each_reloc(rsec, reloc) \
- for (int __i = 0, __fake = 1; __fake; __fake = 0) \
- for (reloc = rsec->relocs; \
- __i < sec_num_entries(rsec); \
- __i++, reloc++)
+ for (reloc = rsec->relocs; reloc; reloc = rsec_next_reloc(rsec, reloc))
#define for_each_reloc_from(rsec, reloc) \
- for (int __i = reloc_idx(reloc); \
- __i < sec_num_entries(rsec); \
- __i++, reloc++)
+ for (; reloc; reloc = rsec_next_reloc(rsec, reloc))
+
+#define for_each_reloc_continue(rsec, reloc) \
+ for (reloc = rsec_next_reloc(rsec, reloc); reloc; \
+ reloc = rsec_next_reloc(rsec, reloc))
+
+#define sym_for_each_reloc(elf, sym, reloc) \
+ for (reloc = find_reloc_by_dest_range(elf, sym->sec, \
+ sym->offset, sym->len); \
+ reloc && reloc_offset(reloc) < sym->offset + sym->len; \
+ reloc = rsec_next_reloc(sym->sec->rsec, reloc))
+
+static inline struct symbol *get_func_prefix(struct symbol *func)
+{
+ struct symbol *prev;
+
+ if (!is_func_sym(func))
+ return NULL;
+
+ prev = sec_prev_sym(func);
+ if (prev && is_prefix_func(prev))
+ return prev;
+
+ return NULL;
+}
#define OFFSET_STRIDE_BITS 4
#define OFFSET_STRIDE (1UL << OFFSET_STRIDE_BITS)
diff --git a/tools/objtool/include/objtool/endianness.h b/tools/objtool/include/objtool/endianness.h
index 4d2aa9b0fe2f..aebcd2338668 100644
--- a/tools/objtool/include/objtool/endianness.h
+++ b/tools/objtool/include/objtool/endianness.h
@@ -4,7 +4,6 @@
#include <linux/kernel.h>
#include <endian.h>
-#include <objtool/elf.h>
/*
* Does a byte swap if target file endianness doesn't match the host, i.e. cross
@@ -12,16 +11,16 @@
* To be used for multi-byte values conversion, which are read from / about
* to be written to a target native endianness ELF file.
*/
-static inline bool need_bswap(struct elf *elf)
+static inline bool need_bswap(GElf_Ehdr *ehdr)
{
return (__BYTE_ORDER == __LITTLE_ENDIAN) ^
- (elf->ehdr.e_ident[EI_DATA] == ELFDATA2LSB);
+ (ehdr->e_ident[EI_DATA] == ELFDATA2LSB);
}
-#define bswap_if_needed(elf, val) \
+#define __bswap_if_needed(ehdr, val) \
({ \
__typeof__(val) __ret; \
- bool __need_bswap = need_bswap(elf); \
+ bool __need_bswap = need_bswap(ehdr); \
switch (sizeof(val)) { \
case 8: \
__ret = __need_bswap ? bswap_64(val) : (val); break; \
diff --git a/tools/objtool/include/objtool/klp.h b/tools/objtool/include/objtool/klp.h
new file mode 100644
index 000000000000..ad830a7ce55b
--- /dev/null
+++ b/tools/objtool/include/objtool/klp.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _OBJTOOL_KLP_H
+#define _OBJTOOL_KLP_H
+
+#define SHF_RELA_LIVEPATCH 0x00100000
+#define SHN_LIVEPATCH 0xff20
+
+/*
+ * __klp_objects and __klp_funcs are created by klp diff and used by the patch
+ * module init code to build the klp_patch, klp_object and klp_func structs
+ * needed by the livepatch API.
+ */
+#define KLP_OBJECTS_SEC "__klp_objects"
+#define KLP_FUNCS_SEC "__klp_funcs"
+
+/*
+ * __klp_relocs is an intermediate section which are created by klp diff and
+ * converted into KLP symbols/relas by "objtool klp post-link". This is needed
+ * to work around the linker, which doesn't preserve SHN_LIVEPATCH or
+ * SHF_RELA_LIVEPATCH, nor does it support having two RELA sections for a
+ * single PROGBITS section.
+ */
+#define KLP_RELOCS_SEC "__klp_relocs"
+#define KLP_STRINGS_SEC ".rodata.klp.str1.1"
+
+struct klp_reloc {
+ void *offset;
+ void *sym;
+ u32 type;
+};
+
+int cmd_klp_diff(int argc, const char **argv);
+int cmd_klp_post_link(int argc, const char **argv);
+
+#endif /* _OBJTOOL_KLP_H */
diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h
index c0dc86a78ff6..f7051bbe0bcb 100644
--- a/tools/objtool/include/objtool/objtool.h
+++ b/tools/objtool/include/objtool/objtool.h
@@ -28,7 +28,7 @@ struct objtool_file {
struct list_head mcount_loc_list;
struct list_head endbr_list;
struct list_head call_list;
- bool ignore_unreachables, hints, rodata;
+ bool ignore_unreachables, hints, rodata, klp;
unsigned int nr_endbr;
unsigned int nr_endbr_int;
@@ -39,6 +39,8 @@ struct objtool_file {
struct pv_state *pv_ops;
};
+char *top_level_dir(const char *file);
+
struct objtool_file *objtool_open_read(const char *_objname);
int objtool_pv_add(struct objtool_file *file, int idx, struct symbol *func);
diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h
index 72d09c0adf1a..121c3761899c 100644
--- a/tools/objtool/include/objtool/special.h
+++ b/tools/objtool/include/objtool/special.h
@@ -25,7 +25,7 @@ struct special_alt {
struct section *new_sec;
unsigned long new_off;
- unsigned int orig_len, new_len; /* group only */
+ unsigned int orig_len, new_len, feature; /* group only */
};
int special_get_alts(struct elf *elf, struct list_head *alts);
@@ -38,4 +38,6 @@ bool arch_support_alt_relocation(struct special_alt *special_alt,
struct reloc *arch_find_switch_table(struct objtool_file *file,
struct instruction *insn,
unsigned long *table_size);
+const char *arch_cpu_feature_name(int feature_number);
+
#endif /* _SPECIAL_H */
diff --git a/tools/objtool/include/objtool/trace.h b/tools/objtool/include/objtool/trace.h
new file mode 100644
index 000000000000..70b574366797
--- /dev/null
+++ b/tools/objtool/include/objtool/trace.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ */
+
+#ifndef _TRACE_H
+#define _TRACE_H
+
+#include <objtool/check.h>
+#include <objtool/disas.h>
+
+#ifdef DISAS
+
+extern bool trace;
+extern int trace_depth;
+
+#define TRACE(fmt, ...) \
+({ if (trace) \
+ fprintf(stderr, fmt, ##__VA_ARGS__); \
+})
+
+/*
+ * Print the instruction address and a message. The instruction
+ * itself is not printed.
+ */
+#define TRACE_ADDR(insn, fmt, ...) \
+({ \
+ if (trace) { \
+ disas_print_info(stderr, insn, trace_depth - 1, \
+ fmt "\n", ##__VA_ARGS__); \
+ } \
+})
+
+/*
+ * Print the instruction address, the instruction and a message.
+ */
+#define TRACE_INSN(insn, fmt, ...) \
+({ \
+ if (trace) { \
+ disas_print_insn(stderr, objtool_disas_ctx, \
+ insn, trace_depth - 1, \
+ fmt, ##__VA_ARGS__); \
+ fprintf(stderr, "\n"); \
+ insn->trace = 1; \
+ } \
+})
+
+#define TRACE_INSN_STATE(insn, sprev, snext) \
+({ \
+ if (trace) \
+ trace_insn_state(insn, sprev, snext); \
+})
+
+#define TRACE_ALT_FMT(pfx, fmt) pfx "<%s.%lx> " fmt
+#define TRACE_ALT_ARG(insn) disas_alt_type_name(insn), (insn)->offset
+
+#define TRACE_ALT(insn, fmt, ...) \
+ TRACE_INSN(insn, TRACE_ALT_FMT("", fmt), \
+ TRACE_ALT_ARG(insn), ##__VA_ARGS__)
+
+#define TRACE_ALT_INFO(insn, pfx, fmt, ...) \
+ TRACE_ADDR(insn, TRACE_ALT_FMT(pfx, fmt), \
+ TRACE_ALT_ARG(insn), ##__VA_ARGS__)
+
+#define TRACE_ALT_INFO_NOADDR(insn, pfx, fmt, ...) \
+ TRACE_ADDR(NULL, TRACE_ALT_FMT(pfx, fmt), \
+ TRACE_ALT_ARG(insn), ##__VA_ARGS__)
+
+#define TRACE_ALT_BEGIN(insn, alt, alt_name) \
+({ \
+ if (trace) { \
+ alt_name = disas_alt_name(alt); \
+ trace_alt_begin(insn, alt, alt_name); \
+ } \
+})
+
+#define TRACE_ALT_END(insn, alt, alt_name) \
+({ \
+ if (trace) { \
+ trace_alt_end(insn, alt, alt_name); \
+ free(alt_name); \
+ } \
+})
+
+static inline void trace_enable(void)
+{
+ trace = true;
+ trace_depth = 0;
+}
+
+static inline void trace_disable(void)
+{
+ trace = false;
+}
+
+static inline void trace_depth_inc(void)
+{
+ if (trace)
+ trace_depth++;
+}
+
+static inline void trace_depth_dec(void)
+{
+ if (trace)
+ trace_depth--;
+}
+
+void trace_insn_state(struct instruction *insn, struct insn_state *sprev,
+ struct insn_state *snext);
+void trace_alt_begin(struct instruction *orig_insn, struct alternative *alt,
+ char *alt_name);
+void trace_alt_end(struct instruction *orig_insn, struct alternative *alt,
+ char *alt_name);
+
+#else /* DISAS */
+
+#define TRACE(fmt, ...) ({})
+#define TRACE_ADDR(insn, fmt, ...) ({})
+#define TRACE_INSN(insn, fmt, ...) ({})
+#define TRACE_INSN_STATE(insn, sprev, snext) ({})
+#define TRACE_ALT(insn, fmt, ...) ({})
+#define TRACE_ALT_INFO(insn, fmt, ...) ({})
+#define TRACE_ALT_INFO_NOADDR(insn, fmt, ...) ({})
+#define TRACE_ALT_BEGIN(insn, alt, alt_name) ({})
+#define TRACE_ALT_END(insn, alt, alt_name) ({})
+
+
+static inline void trace_enable(void) {}
+static inline void trace_disable(void) {}
+static inline void trace_depth_inc(void) {}
+static inline void trace_depth_dec(void) {}
+static inline void trace_alt_begin(struct instruction *orig_insn,
+ struct alternative *alt,
+ char *alt_name) {};
+static inline void trace_alt_end(struct instruction *orig_insn,
+ struct alternative *alt,
+ char *alt_name) {};
+
+#endif
+
+#endif /* _TRACE_H */
diff --git a/tools/objtool/include/objtool/util.h b/tools/objtool/include/objtool/util.h
new file mode 100644
index 000000000000..a0180b312f73
--- /dev/null
+++ b/tools/objtool/include/objtool/util.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _UTIL_H
+#define _UTIL_H
+
+#include <objtool/warn.h>
+
+#define snprintf_check(str, size, format, args...) \
+({ \
+ int __ret = snprintf(str, size, format, args); \
+ if (__ret < 0) \
+ ERROR_GLIBC("snprintf"); \
+ else if (__ret >= size) \
+ ERROR("snprintf() failed for '" format "'", args); \
+ else \
+ __ret = 0; \
+ __ret; \
+})
+
+#endif /* _UTIL_H */
diff --git a/tools/objtool/include/objtool/warn.h b/tools/objtool/include/objtool/warn.h
index cb8fe846d9dd..25ff7942b4d5 100644
--- a/tools/objtool/include/objtool/warn.h
+++ b/tools/objtool/include/objtool/warn.h
@@ -77,9 +77,11 @@ static inline char *offstr(struct section *sec, unsigned long offset)
#define WARN_INSN(insn, format, ...) \
({ \
struct instruction *_insn = (insn); \
- if (!_insn->sym || !_insn->sym->warned) \
+ if (!_insn->sym || !_insn->sym->warned) { \
WARN_FUNC(_insn->sec, _insn->offset, format, \
##__VA_ARGS__); \
+ BT_INSN(_insn, ""); \
+ } \
if (_insn->sym) \
_insn->sym->warned = 1; \
})
@@ -87,10 +89,15 @@ static inline char *offstr(struct section *sec, unsigned long offset)
#define BT_INSN(insn, format, ...) \
({ \
if (opts.verbose || opts.backtrace) { \
- struct instruction *_insn = (insn); \
- char *_str = offstr(_insn->sec, _insn->offset); \
- WARN(" %s: " format, _str, ##__VA_ARGS__); \
- free(_str); \
+ struct instruction *__insn = (insn); \
+ char *_str = offstr(__insn->sec, __insn->offset); \
+ const char *_istr = objtool_disas_insn(__insn); \
+ int _len; \
+ _len = snprintf(NULL, 0, " %s: " format, _str, ##__VA_ARGS__); \
+ _len = (_len < 50) ? 50 - _len : 0; \
+ WARN(" %s: " format " %*s%s", _str, ##__VA_ARGS__, _len, "", _istr); \
+ free(_str); \
+ __insn->trace = 1; \
} \
})
@@ -102,4 +109,53 @@ static inline char *offstr(struct section *sec, unsigned long offset)
#define ERROR_FUNC(sec, offset, format, ...) __WARN_FUNC(ERROR_STR, sec, offset, format, ##__VA_ARGS__)
#define ERROR_INSN(insn, format, ...) WARN_FUNC(insn->sec, insn->offset, format, ##__VA_ARGS__)
+extern bool debug;
+extern int indent;
+
+static inline void unindent(int *unused) { indent--; }
+
+/*
+ * Clang prior to 17 is being silly and considers many __cleanup() variables
+ * as unused (because they are, their sole purpose is to go out of scope).
+ *
+ * https://github.com/llvm/llvm-project/commit/877210faa447f4cc7db87812f8ed80e398fedd61
+ */
+#undef __cleanup
+#define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func)))
+
+#define __dbg(format, ...) \
+ fprintf(stderr, \
+ "DEBUG: %s%s" format "\n", \
+ objname ?: "", \
+ objname ? ": " : "", \
+ ##__VA_ARGS__)
+
+#define dbg(args...) \
+({ \
+ if (unlikely(debug)) \
+ __dbg(args); \
+})
+
+#define __dbg_indent(format, ...) \
+({ \
+ if (unlikely(debug)) \
+ __dbg("%*s" format, indent * 8, "", ##__VA_ARGS__); \
+})
+
+#define dbg_indent(args...) \
+ int __cleanup(unindent) __dummy_##__COUNTER__; \
+ __dbg_indent(args); \
+ indent++
+
+#define dbg_checksum(func, insn, checksum) \
+({ \
+ if (unlikely(insn->sym && insn->sym->pfunc && \
+ insn->sym->pfunc->debug_checksum)) { \
+ char *insn_off = offstr(insn->sec, insn->offset); \
+ __dbg("checksum: %s %s %016lx", \
+ func->name, insn_off, checksum); \
+ free(insn_off); \
+ } \
+})
+
#endif /* _WARN_H */
diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c
new file mode 100644
index 000000000000..4d1f9e9977eb
--- /dev/null
+++ b/tools/objtool/klp-diff.c
@@ -0,0 +1,1723 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#define _GNU_SOURCE /* memmem() */
+#include <subcmd/parse-options.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libgen.h>
+#include <stdio.h>
+#include <ctype.h>
+
+#include <objtool/objtool.h>
+#include <objtool/warn.h>
+#include <objtool/arch.h>
+#include <objtool/klp.h>
+#include <objtool/util.h>
+#include <arch/special.h>
+
+#include <linux/objtool_types.h>
+#include <linux/livepatch_external.h>
+#include <linux/stringify.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+
+#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
+
+struct elfs {
+ struct elf *orig, *patched, *out;
+ const char *modname;
+};
+
+struct export {
+ struct hlist_node hash;
+ char *mod, *sym;
+};
+
+static const char * const klp_diff_usage[] = {
+ "objtool klp diff [<options>] <in1.o> <in2.o> <out.o>",
+ NULL,
+};
+
+static const struct option klp_diff_options[] = {
+ OPT_GROUP("Options:"),
+ OPT_BOOLEAN('d', "debug", &debug, "enable debug output"),
+ OPT_END(),
+};
+
+static DEFINE_HASHTABLE(exports, 15);
+
+static inline u32 str_hash(const char *str)
+{
+ return jhash(str, strlen(str), 0);
+}
+
+static char *escape_str(const char *orig)
+{
+ size_t len = 0;
+ const char *a;
+ char *b, *new;
+
+ for (a = orig; *a; a++) {
+ switch (*a) {
+ case '\001': len += 5; break;
+ case '\n':
+ case '\t': len += 2; break;
+ default: len++;
+ }
+ }
+
+ new = malloc(len + 1);
+ if (!new)
+ return NULL;
+
+ for (a = orig, b = new; *a; a++) {
+ switch (*a) {
+ case '\001': memcpy(b, "<SOH>", 5); b += 5; break;
+ case '\n': *b++ = '\\'; *b++ = 'n'; break;
+ case '\t': *b++ = '\\'; *b++ = 't'; break;
+ default: *b++ = *a;
+ }
+ }
+
+ *b = '\0';
+ return new;
+}
+
+static int read_exports(void)
+{
+ const char *symvers = "Module.symvers";
+ char line[1024], *path = NULL;
+ unsigned int line_num = 1;
+ FILE *file;
+
+ file = fopen(symvers, "r");
+ if (!file) {
+ path = top_level_dir(symvers);
+ if (!path) {
+ ERROR("can't open '%s', \"objtool diff\" should be run from the kernel tree", symvers);
+ return -1;
+ }
+
+ file = fopen(path, "r");
+ if (!file) {
+ ERROR_GLIBC("fopen");
+ return -1;
+ }
+ }
+
+ while (fgets(line, 1024, file)) {
+ char *sym, *mod, *type;
+ struct export *export;
+
+ sym = strchr(line, '\t');
+ if (!sym) {
+ ERROR("malformed Module.symvers (sym) at line %d", line_num);
+ return -1;
+ }
+
+ *sym++ = '\0';
+
+ mod = strchr(sym, '\t');
+ if (!mod) {
+ ERROR("malformed Module.symvers (mod) at line %d", line_num);
+ return -1;
+ }
+
+ *mod++ = '\0';
+
+ type = strchr(mod, '\t');
+ if (!type) {
+ ERROR("malformed Module.symvers (type) at line %d", line_num);
+ return -1;
+ }
+
+ *type++ = '\0';
+
+ if (*sym == '\0' || *mod == '\0') {
+ ERROR("malformed Module.symvers at line %d", line_num);
+ return -1;
+ }
+
+ export = calloc(1, sizeof(*export));
+ if (!export) {
+ ERROR_GLIBC("calloc");
+ return -1;
+ }
+
+ export->mod = strdup(mod);
+ if (!export->mod) {
+ ERROR_GLIBC("strdup");
+ return -1;
+ }
+
+ export->sym = strdup(sym);
+ if (!export->sym) {
+ ERROR_GLIBC("strdup");
+ return -1;
+ }
+
+ hash_add(exports, &export->hash, str_hash(sym));
+ }
+
+ free(path);
+ fclose(file);
+
+ return 0;
+}
+
+static int read_sym_checksums(struct elf *elf)
+{
+ struct section *sec;
+
+ sec = find_section_by_name(elf, ".discard.sym_checksum");
+ if (!sec) {
+ ERROR("'%s' missing .discard.sym_checksum section, file not processed by 'objtool --checksum'?",
+ elf->name);
+ return -1;
+ }
+
+ if (!sec->rsec) {
+ ERROR("missing reloc section for .discard.sym_checksum");
+ return -1;
+ }
+
+ if (sec_size(sec) % sizeof(struct sym_checksum)) {
+ ERROR("struct sym_checksum size mismatch");
+ return -1;
+ }
+
+ for (int i = 0; i < sec_size(sec) / sizeof(struct sym_checksum); i++) {
+ struct sym_checksum *sym_checksum;
+ struct reloc *reloc;
+ struct symbol *sym;
+
+ sym_checksum = (struct sym_checksum *)sec->data->d_buf + i;
+
+ reloc = find_reloc_by_dest(elf, sec, i * sizeof(*sym_checksum));
+ if (!reloc) {
+ ERROR("can't find reloc for sym_checksum[%d]", i);
+ return -1;
+ }
+
+ sym = reloc->sym;
+
+ if (is_sec_sym(sym)) {
+ ERROR("not sure how to handle section %s", sym->name);
+ return -1;
+ }
+
+ if (is_func_sym(sym))
+ sym->csum.checksum = sym_checksum->checksum;
+ }
+
+ return 0;
+}
+
+static struct symbol *first_file_symbol(struct elf *elf)
+{
+ struct symbol *sym;
+
+ for_each_sym(elf, sym) {
+ if (is_file_sym(sym))
+ return sym;
+ }
+
+ return NULL;
+}
+
+static struct symbol *next_file_symbol(struct elf *elf, struct symbol *sym)
+{
+ for_each_sym_continue(elf, sym) {
+ if (is_file_sym(sym))
+ return sym;
+ }
+
+ return NULL;
+}
+
+/*
+ * Certain static local variables should never be correlated. They will be
+ * used in place rather than referencing the originals.
+ */
+static bool is_uncorrelated_static_local(struct symbol *sym)
+{
+ static const char * const vars[] = {
+ "__already_done.",
+ "__func__.",
+ "__key.",
+ "__warned.",
+ "_entry.",
+ "_entry_ptr.",
+ "_rs.",
+ "descriptor.",
+ "CSWTCH.",
+ };
+
+ if (!is_object_sym(sym) || !is_local_sym(sym))
+ return false;
+
+ if (!strcmp(sym->sec->name, ".data.once"))
+ return true;
+
+ for (int i = 0; i < ARRAY_SIZE(vars); i++) {
+ if (strstarts(sym->name, vars[i]))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Clang emits several useless .Ltmp_* code labels.
+ */
+static bool is_clang_tmp_label(struct symbol *sym)
+{
+ return sym->type == STT_NOTYPE &&
+ is_text_sec(sym->sec) &&
+ strstarts(sym->name, ".Ltmp") &&
+ isdigit(sym->name[5]);
+}
+
+static bool is_special_section(struct section *sec)
+{
+ static const char * const specials[] = {
+ ".altinstructions",
+ ".smp_locks",
+ "__bug_table",
+ "__ex_table",
+ "__jump_table",
+ "__mcount_loc",
+
+ /*
+ * Extract .static_call_sites here to inherit non-module
+ * preferential treatment. The later static call processing
+ * during klp module build will be skipped when it sees this
+ * section already exists.
+ */
+ ".static_call_sites",
+ };
+
+ static const char * const non_special_discards[] = {
+ ".discard.addressable",
+ ".discard.sym_checksum",
+ };
+
+ if (is_text_sec(sec))
+ return false;
+
+ for (int i = 0; i < ARRAY_SIZE(specials); i++) {
+ if (!strcmp(sec->name, specials[i]))
+ return true;
+ }
+
+ /* Most .discard data sections are special */
+ for (int i = 0; i < ARRAY_SIZE(non_special_discards); i++) {
+ if (!strcmp(sec->name, non_special_discards[i]))
+ return false;
+ }
+
+ return strstarts(sec->name, ".discard.");
+}
+
+/*
+ * These sections are referenced by special sections but aren't considered
+ * special sections themselves.
+ */
+static bool is_special_section_aux(struct section *sec)
+{
+ static const char * const specials_aux[] = {
+ ".altinstr_replacement",
+ ".altinstr_aux",
+ };
+
+ for (int i = 0; i < ARRAY_SIZE(specials_aux); i++) {
+ if (!strcmp(sec->name, specials_aux[i]))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * These symbols should never be correlated, so their local patched versions
+ * are used instead of linking to the originals.
+ */
+static bool dont_correlate(struct symbol *sym)
+{
+ return is_file_sym(sym) ||
+ is_null_sym(sym) ||
+ is_sec_sym(sym) ||
+ is_prefix_func(sym) ||
+ is_uncorrelated_static_local(sym) ||
+ is_clang_tmp_label(sym) ||
+ is_string_sec(sym->sec) ||
+ is_special_section(sym->sec) ||
+ is_special_section_aux(sym->sec) ||
+ strstarts(sym->name, "__initcall__");
+}
+
+/*
+ * For each symbol in the original kernel, find its corresponding "twin" in the
+ * patched kernel.
+ */
+static int correlate_symbols(struct elfs *e)
+{
+ struct symbol *file1_sym, *file2_sym;
+ struct symbol *sym1, *sym2;
+
+ /* Correlate locals */
+ for (file1_sym = first_file_symbol(e->orig),
+ file2_sym = first_file_symbol(e->patched); ;
+ file1_sym = next_file_symbol(e->orig, file1_sym),
+ file2_sym = next_file_symbol(e->patched, file2_sym)) {
+
+ if (!file1_sym && file2_sym) {
+ ERROR("FILE symbol mismatch: NULL != %s", file2_sym->name);
+ return -1;
+ }
+
+ if (file1_sym && !file2_sym) {
+ ERROR("FILE symbol mismatch: %s != NULL", file1_sym->name);
+ return -1;
+ }
+
+ if (!file1_sym)
+ break;
+
+ if (strcmp(file1_sym->name, file2_sym->name)) {
+ ERROR("FILE symbol mismatch: %s != %s", file1_sym->name, file2_sym->name);
+ return -1;
+ }
+
+ file1_sym->twin = file2_sym;
+ file2_sym->twin = file1_sym;
+
+ sym1 = file1_sym;
+
+ for_each_sym_continue(e->orig, sym1) {
+ if (is_file_sym(sym1) || !is_local_sym(sym1))
+ break;
+
+ if (dont_correlate(sym1))
+ continue;
+
+ sym2 = file2_sym;
+ for_each_sym_continue(e->patched, sym2) {
+ if (is_file_sym(sym2) || !is_local_sym(sym2))
+ break;
+
+ if (sym2->twin || dont_correlate(sym2))
+ continue;
+
+ if (strcmp(sym1->demangled_name, sym2->demangled_name))
+ continue;
+
+ sym1->twin = sym2;
+ sym2->twin = sym1;
+ break;
+ }
+ }
+ }
+
+ /* Correlate globals */
+ for_each_sym(e->orig, sym1) {
+ if (sym1->bind == STB_LOCAL)
+ continue;
+
+ sym2 = find_global_symbol_by_name(e->patched, sym1->name);
+
+ if (sym2 && !sym2->twin && !strcmp(sym1->name, sym2->name)) {
+ sym1->twin = sym2;
+ sym2->twin = sym1;
+ }
+ }
+
+ for_each_sym(e->orig, sym1) {
+ if (sym1->twin || dont_correlate(sym1))
+ continue;
+ WARN("no correlation: %s", sym1->name);
+ }
+
+ return 0;
+}
+
+/* "sympos" is used by livepatch to disambiguate duplicate symbol names */
+static unsigned long find_sympos(struct elf *elf, struct symbol *sym)
+{
+ bool vmlinux = str_ends_with(objname, "vmlinux.o");
+ unsigned long sympos = 0, nr_matches = 0;
+ bool has_dup = false;
+ struct symbol *s;
+
+ if (sym->bind != STB_LOCAL)
+ return 0;
+
+ if (vmlinux && sym->type == STT_FUNC) {
+ /*
+ * HACK: Unfortunately, symbol ordering can differ between
+ * vmlinux.o and vmlinux due to the linker script emitting
+ * .text.unlikely* before .text*. Count .text.unlikely* first.
+ *
+ * TODO: Disambiguate symbols more reliably (checksums?)
+ */
+ for_each_sym(elf, s) {
+ if (strstarts(s->sec->name, ".text.unlikely") &&
+ !strcmp(s->name, sym->name)) {
+ nr_matches++;
+ if (s == sym)
+ sympos = nr_matches;
+ else
+ has_dup = true;
+ }
+ }
+ for_each_sym(elf, s) {
+ if (!strstarts(s->sec->name, ".text.unlikely") &&
+ !strcmp(s->name, sym->name)) {
+ nr_matches++;
+ if (s == sym)
+ sympos = nr_matches;
+ else
+ has_dup = true;
+ }
+ }
+ } else {
+ for_each_sym(elf, s) {
+ if (!strcmp(s->name, sym->name)) {
+ nr_matches++;
+ if (s == sym)
+ sympos = nr_matches;
+ else
+ has_dup = true;
+ }
+ }
+ }
+
+ if (!sympos) {
+ ERROR("can't find sympos for %s", sym->name);
+ return ULONG_MAX;
+ }
+
+ return has_dup ? sympos : 0;
+}
+
+static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym);
+
+static struct symbol *__clone_symbol(struct elf *elf, struct symbol *patched_sym,
+ bool data_too)
+{
+ struct section *out_sec = NULL;
+ unsigned long offset = 0;
+ struct symbol *out_sym;
+
+ if (data_too && !is_undef_sym(patched_sym)) {
+ struct section *patched_sec = patched_sym->sec;
+
+ out_sec = find_section_by_name(elf, patched_sec->name);
+ if (!out_sec) {
+ out_sec = elf_create_section(elf, patched_sec->name, 0,
+ patched_sec->sh.sh_entsize,
+ patched_sec->sh.sh_type,
+ patched_sec->sh.sh_addralign,
+ patched_sec->sh.sh_flags);
+ if (!out_sec)
+ return NULL;
+ }
+
+ if (is_string_sec(patched_sym->sec)) {
+ out_sym = elf_create_section_symbol(elf, out_sec);
+ if (!out_sym)
+ return NULL;
+
+ goto sym_created;
+ }
+
+ if (!is_sec_sym(patched_sym))
+ offset = sec_size(out_sec);
+
+ if (patched_sym->len || is_sec_sym(patched_sym)) {
+ void *data = NULL;
+ size_t size;
+
+ /* bss doesn't have data */
+ if (patched_sym->sec->data->d_buf)
+ data = patched_sym->sec->data->d_buf + patched_sym->offset;
+
+ if (is_sec_sym(patched_sym))
+ size = sec_size(patched_sym->sec);
+ else
+ size = patched_sym->len;
+
+ if (!elf_add_data(elf, out_sec, data, size))
+ return NULL;
+ }
+ }
+
+ out_sym = elf_create_symbol(elf, patched_sym->name, out_sec,
+ patched_sym->bind, patched_sym->type,
+ offset, patched_sym->len);
+ if (!out_sym)
+ return NULL;
+
+sym_created:
+ patched_sym->clone = out_sym;
+ out_sym->clone = patched_sym;
+
+ return out_sym;
+}
+
+static const char *sym_type(struct symbol *sym)
+{
+ switch (sym->type) {
+ case STT_NOTYPE: return "NOTYPE";
+ case STT_OBJECT: return "OBJECT";
+ case STT_FUNC: return "FUNC";
+ case STT_SECTION: return "SECTION";
+ case STT_FILE: return "FILE";
+ default: return "UNKNOWN";
+ }
+}
+
+static const char *sym_bind(struct symbol *sym)
+{
+ switch (sym->bind) {
+ case STB_LOCAL: return "LOCAL";
+ case STB_GLOBAL: return "GLOBAL";
+ case STB_WEAK: return "WEAK";
+ default: return "UNKNOWN";
+ }
+}
+
+/*
+ * Copy a symbol to the output object, optionally including its data and
+ * relocations.
+ */
+static struct symbol *clone_symbol(struct elfs *e, struct symbol *patched_sym,
+ bool data_too)
+{
+ struct symbol *pfx;
+
+ if (patched_sym->clone)
+ return patched_sym->clone;
+
+ dbg_indent("%s%s", patched_sym->name, data_too ? " [+DATA]" : "");
+
+ /* Make sure the prefix gets cloned first */
+ if (is_func_sym(patched_sym) && data_too) {
+ pfx = get_func_prefix(patched_sym);
+ if (pfx)
+ clone_symbol(e, pfx, true);
+ }
+
+ if (!__clone_symbol(e->out, patched_sym, data_too))
+ return NULL;
+
+ if (data_too && clone_sym_relocs(e, patched_sym))
+ return NULL;
+
+ return patched_sym->clone;
+}
+
+static void mark_included_function(struct symbol *func)
+{
+ struct symbol *pfx;
+
+ func->included = 1;
+
+ /* Include prefix function */
+ pfx = get_func_prefix(func);
+ if (pfx)
+ pfx->included = 1;
+
+ /* Make sure .cold parent+child always stay together */
+ if (func->cfunc && func->cfunc != func)
+ func->cfunc->included = 1;
+ if (func->pfunc && func->pfunc != func)
+ func->pfunc->included = 1;
+}
+
+/*
+ * Copy all changed functions (and their dependencies) from the patched object
+ * to the output object.
+ */
+static int mark_changed_functions(struct elfs *e)
+{
+ struct symbol *sym_orig, *patched_sym;
+ bool changed = false;
+
+ /* Find changed functions */
+ for_each_sym(e->orig, sym_orig) {
+ if (!is_func_sym(sym_orig) || is_prefix_func(sym_orig))
+ continue;
+
+ patched_sym = sym_orig->twin;
+ if (!patched_sym)
+ continue;
+
+ if (sym_orig->csum.checksum != patched_sym->csum.checksum) {
+ patched_sym->changed = 1;
+ mark_included_function(patched_sym);
+ changed = true;
+ }
+ }
+
+ /* Find added functions and print them */
+ for_each_sym(e->patched, patched_sym) {
+ if (!is_func_sym(patched_sym) || is_prefix_func(patched_sym))
+ continue;
+
+ if (!patched_sym->twin) {
+ printf("%s: new function: %s\n", objname, patched_sym->name);
+ mark_included_function(patched_sym);
+ changed = true;
+ }
+ }
+
+ /* Print changed functions */
+ for_each_sym(e->patched, patched_sym) {
+ if (patched_sym->changed)
+ printf("%s: changed function: %s\n", objname, patched_sym->name);
+ }
+
+ return !changed ? -1 : 0;
+}
+
+static int clone_included_functions(struct elfs *e)
+{
+ struct symbol *patched_sym;
+
+ for_each_sym(e->patched, patched_sym) {
+ if (patched_sym->included) {
+ if (!clone_symbol(e, patched_sym, true))
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Determine whether a relocation should reference the section rather than the
+ * underlying symbol.
+ */
+static bool section_reference_needed(struct section *sec)
+{
+ /*
+ * String symbols are zero-length and uncorrelated. It's easier to
+ * deal with them as section symbols.
+ */
+ if (is_string_sec(sec))
+ return true;
+
+ /*
+ * .rodata has mostly anonymous data so there's no way to determine the
+ * length of a needed reference. just copy the whole section if needed.
+ */
+ if (strstarts(sec->name, ".rodata"))
+ return true;
+
+ /* UBSAN anonymous data */
+ if (strstarts(sec->name, ".data..Lubsan") || /* GCC */
+ strstarts(sec->name, ".data..L__unnamed_")) /* Clang */
+ return true;
+
+ return false;
+}
+
+static bool is_reloc_allowed(struct reloc *reloc)
+{
+ return section_reference_needed(reloc->sym->sec) == is_sec_sym(reloc->sym);
+}
+
+static struct export *find_export(struct symbol *sym)
+{
+ struct export *export;
+
+ hash_for_each_possible(exports, export, hash, str_hash(sym->name)) {
+ if (!strcmp(export->sym, sym->name))
+ return export;
+ }
+
+ return NULL;
+}
+
+static const char *__find_modname(struct elfs *e)
+{
+ struct section *sec;
+ char *name;
+
+ sec = find_section_by_name(e->orig, ".modinfo");
+ if (!sec) {
+ ERROR("missing .modinfo section");
+ return NULL;
+ }
+
+ name = memmem(sec->data->d_buf, sec_size(sec), "\0name=", 6);
+ if (name)
+ return name + 6;
+
+ name = strdup(e->orig->name);
+ if (!name) {
+ ERROR_GLIBC("strdup");
+ return NULL;
+ }
+
+ for (char *c = name; *c; c++) {
+ if (*c == '/')
+ name = c + 1;
+ else if (*c == '-')
+ *c = '_';
+ else if (*c == '.') {
+ *c = '\0';
+ break;
+ }
+ }
+
+ return name;
+}
+
+/* Get the object's module name as defined by the kernel (and klp_object) */
+static const char *find_modname(struct elfs *e)
+{
+ const char *modname;
+
+ if (e->modname)
+ return e->modname;
+
+ modname = __find_modname(e);
+ e->modname = modname;
+ return modname;
+}
+
+/*
+ * Copying a function from its native compiled environment to a kernel module
+ * removes its natural access to local functions/variables and unexported
+ * globals. References to such symbols need to be converted to KLP relocs so
+ * the kernel arch relocation code knows to apply them and where to find the
+ * symbols. Particularly, duplicate static symbols need to be disambiguated.
+ */
+static bool klp_reloc_needed(struct reloc *patched_reloc)
+{
+ struct symbol *patched_sym = patched_reloc->sym;
+ struct export *export;
+
+ /* no external symbol to reference */
+ if (dont_correlate(patched_sym))
+ return false;
+
+ /* For included functions, a regular reloc will do. */
+ if (patched_sym->included)
+ return false;
+
+ /*
+ * If exported by a module, it has to be a klp reloc. Thanks to the
+ * clusterfunk that is late module patching, the patch module is
+ * allowed to be loaded before any modules it depends on.
+ *
+ * If exported by vmlinux, a normal reloc will do.
+ */
+ export = find_export(patched_sym);
+ if (export)
+ return strcmp(export->mod, "vmlinux");
+
+ if (!patched_sym->twin) {
+ /*
+ * Presumably the symbol and its reference were added by the
+ * patch. The symbol could be defined in this .o or in another
+ * .o in the patch module.
+ *
+ * This check needs to be *after* the export check due to the
+ * possibility of the patch adding a new UNDEF reference to an
+ * exported symbol.
+ */
+ return false;
+ }
+
+ /* Unexported symbol which lives in the original vmlinux or module. */
+ return true;
+}
+
+static int convert_reloc_sym_to_secsym(struct elf *elf, struct reloc *reloc)
+{
+ struct symbol *sym = reloc->sym;
+ struct section *sec = sym->sec;
+
+ if (!sec->sym && !elf_create_section_symbol(elf, sec))
+ return -1;
+
+ reloc->sym = sec->sym;
+ set_reloc_sym(elf, reloc, sym->idx);
+ set_reloc_addend(elf, reloc, sym->offset + reloc_addend(reloc));
+ return 0;
+}
+
+static int convert_reloc_secsym_to_sym(struct elf *elf, struct reloc *reloc)
+{
+ struct symbol *sym = reloc->sym;
+ struct section *sec = sym->sec;
+
+ /* If the symbol has a dedicated section, it's easy to find */
+ sym = find_symbol_by_offset(sec, 0);
+ if (sym && sym->len == sec_size(sec))
+ goto found_sym;
+
+ /* No dedicated section; find the symbol manually */
+ sym = find_symbol_containing(sec, arch_adjusted_addend(reloc));
+ if (!sym) {
+ /*
+ * This can happen for special section references to weak code
+ * whose symbol has been stripped by the linker.
+ */
+ return -1;
+ }
+
+found_sym:
+ reloc->sym = sym;
+ set_reloc_sym(elf, reloc, sym->idx);
+ set_reloc_addend(elf, reloc, reloc_addend(reloc) - sym->offset);
+ return 0;
+}
+
+/*
+ * Convert a relocation symbol reference to the needed format: either a section
+ * symbol or the underlying symbol itself.
+ */
+static int convert_reloc_sym(struct elf *elf, struct reloc *reloc)
+{
+ if (is_reloc_allowed(reloc))
+ return 0;
+
+ if (section_reference_needed(reloc->sym->sec))
+ return convert_reloc_sym_to_secsym(elf, reloc);
+ else
+ return convert_reloc_secsym_to_sym(elf, reloc);
+}
+
+/*
+ * Convert a regular relocation to a klp relocation (sort of).
+ */
+static int clone_reloc_klp(struct elfs *e, struct reloc *patched_reloc,
+ struct section *sec, unsigned long offset,
+ struct export *export)
+{
+ struct symbol *patched_sym = patched_reloc->sym;
+ s64 addend = reloc_addend(patched_reloc);
+ const char *sym_modname, *sym_orig_name;
+ static struct section *klp_relocs;
+ struct symbol *sym, *klp_sym;
+ unsigned long klp_reloc_off;
+ char sym_name[SYM_NAME_LEN];
+ struct klp_reloc klp_reloc;
+ unsigned long sympos;
+
+ if (!patched_sym->twin) {
+ ERROR("unexpected klp reloc for new symbol %s", patched_sym->name);
+ return -1;
+ }
+
+ /*
+ * Keep the original reloc intact for now to avoid breaking objtool run
+ * which relies on proper relocations for many of its features. This
+ * will be disabled later by "objtool klp post-link".
+ *
+ * Convert it to UNDEF (and WEAK to avoid modpost warnings).
+ */
+
+ sym = patched_sym->clone;
+ if (!sym) {
+ /* STB_WEAK: avoid modpost undefined symbol warnings */
+ sym = elf_create_symbol(e->out, patched_sym->name, NULL,
+ STB_WEAK, patched_sym->type, 0, 0);
+ if (!sym)
+ return -1;
+
+ patched_sym->clone = sym;
+ sym->clone = patched_sym;
+ }
+
+ if (!elf_create_reloc(e->out, sec, offset, sym, addend, reloc_type(patched_reloc)))
+ return -1;
+
+ /*
+ * Create the KLP symbol.
+ */
+
+ if (export) {
+ sym_modname = export->mod;
+ sym_orig_name = export->sym;
+ sympos = 0;
+ } else {
+ sym_modname = find_modname(e);
+ if (!sym_modname)
+ return -1;
+
+ sym_orig_name = patched_sym->twin->name;
+ sympos = find_sympos(e->orig, patched_sym->twin);
+ if (sympos == ULONG_MAX)
+ return -1;
+ }
+
+ /* symbol format: .klp.sym.modname.sym_name,sympos */
+ if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_SYM_PREFIX "%s.%s,%ld",
+ sym_modname, sym_orig_name, sympos))
+ return -1;
+
+ klp_sym = find_symbol_by_name(e->out, sym_name);
+ if (!klp_sym) {
+ __dbg_indent("%s", sym_name);
+
+ /* STB_WEAK: avoid modpost undefined symbol warnings */
+ klp_sym = elf_create_symbol(e->out, sym_name, NULL,
+ STB_WEAK, patched_sym->type, 0, 0);
+ if (!klp_sym)
+ return -1;
+ }
+
+ /*
+ * Create the __klp_relocs entry. This will be converted to an actual
+ * KLP rela by "objtool klp post-link".
+ *
+ * This intermediate step is necessary to prevent corruption by the
+ * linker, which doesn't know how to properly handle two rela sections
+ * applying to the same base section.
+ */
+
+ if (!klp_relocs) {
+ klp_relocs = elf_create_section(e->out, KLP_RELOCS_SEC, 0,
+ 0, SHT_PROGBITS, 8, SHF_ALLOC);
+ if (!klp_relocs)
+ return -1;
+ }
+
+ klp_reloc_off = sec_size(klp_relocs);
+ memset(&klp_reloc, 0, sizeof(klp_reloc));
+
+ klp_reloc.type = reloc_type(patched_reloc);
+ if (!elf_add_data(e->out, klp_relocs, &klp_reloc, sizeof(klp_reloc)))
+ return -1;
+
+ /* klp_reloc.offset */
+ if (!sec->sym && !elf_create_section_symbol(e->out, sec))
+ return -1;
+
+ if (!elf_create_reloc(e->out, klp_relocs,
+ klp_reloc_off + offsetof(struct klp_reloc, offset),
+ sec->sym, offset, R_ABS64))
+ return -1;
+
+ /* klp_reloc.sym */
+ if (!elf_create_reloc(e->out, klp_relocs,
+ klp_reloc_off + offsetof(struct klp_reloc, sym),
+ klp_sym, addend, R_ABS64))
+ return -1;
+
+ return 0;
+}
+
+#define dbg_clone_reloc(sec, offset, patched_sym, addend, export, klp) \
+ dbg_indent("%s+0x%lx: %s%s0x%lx [%s%s%s%s%s%s]", \
+ sec->name, offset, patched_sym->name, \
+ addend >= 0 ? "+" : "-", labs(addend), \
+ sym_type(patched_sym), \
+ patched_sym->type == STT_SECTION ? "" : " ", \
+ patched_sym->type == STT_SECTION ? "" : sym_bind(patched_sym), \
+ is_undef_sym(patched_sym) ? " UNDEF" : "", \
+ export ? " EXPORTED" : "", \
+ klp ? " KLP" : "")
+
+/* Copy a reloc and its symbol to the output object */
+static int clone_reloc(struct elfs *e, struct reloc *patched_reloc,
+ struct section *sec, unsigned long offset)
+{
+ struct symbol *patched_sym = patched_reloc->sym;
+ struct export *export = find_export(patched_sym);
+ long addend = reloc_addend(patched_reloc);
+ struct symbol *out_sym;
+ bool klp;
+
+ if (!is_reloc_allowed(patched_reloc)) {
+ ERROR_FUNC(patched_reloc->sec->base, reloc_offset(patched_reloc),
+ "missing symbol for reference to %s+%ld",
+ patched_sym->name, addend);
+ return -1;
+ }
+
+ klp = klp_reloc_needed(patched_reloc);
+
+ dbg_clone_reloc(sec, offset, patched_sym, addend, export, klp);
+
+ if (klp) {
+ if (clone_reloc_klp(e, patched_reloc, sec, offset, export))
+ return -1;
+
+ return 0;
+ }
+
+ /*
+ * Why !export sets 'data_too':
+ *
+ * Unexported non-klp symbols need to live in the patch module,
+ * otherwise there will be unresolved symbols. Notably, this includes:
+ *
+ * - New functions/data
+ * - String sections
+ * - Special section entries
+ * - Uncorrelated static local variables
+ * - UBSAN sections
+ */
+ out_sym = clone_symbol(e, patched_sym, patched_sym->included || !export);
+ if (!out_sym)
+ return -1;
+
+ /*
+ * For strings, all references use section symbols, thanks to
+ * section_reference_needed(). clone_symbol() has cloned an empty
+ * version of the string section. Now copy the string itself.
+ */
+ if (is_string_sec(patched_sym->sec)) {
+ const char *str = patched_sym->sec->data->d_buf + addend;
+
+ __dbg_indent("\"%s\"", escape_str(str));
+
+ addend = elf_add_string(e->out, out_sym->sec, str);
+ if (addend == -1)
+ return -1;
+ }
+
+ if (!elf_create_reloc(e->out, sec, offset, out_sym, addend,
+ reloc_type(patched_reloc)))
+ return -1;
+
+ return 0;
+}
+
+/* Copy all relocs needed for a symbol's contents */
+static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym)
+{
+ struct section *patched_rsec = patched_sym->sec->rsec;
+ struct reloc *patched_reloc;
+ unsigned long start, end;
+ struct symbol *out_sym;
+
+ out_sym = patched_sym->clone;
+ if (!out_sym) {
+ ERROR("no clone for %s", patched_sym->name);
+ return -1;
+ }
+
+ if (!patched_rsec)
+ return 0;
+
+ if (!is_sec_sym(patched_sym) && !patched_sym->len)
+ return 0;
+
+ if (is_string_sec(patched_sym->sec))
+ return 0;
+
+ if (is_sec_sym(patched_sym)) {
+ start = 0;
+ end = sec_size(patched_sym->sec);
+ } else {
+ start = patched_sym->offset;
+ end = start + patched_sym->len;
+ }
+
+ for_each_reloc(patched_rsec, patched_reloc) {
+ unsigned long offset;
+
+ if (reloc_offset(patched_reloc) < start ||
+ reloc_offset(patched_reloc) >= end)
+ continue;
+
+ /*
+ * Skip any reloc referencing .altinstr_aux. Its code is
+ * always patched by alternatives. See ALTERNATIVE_TERNARY().
+ */
+ if (patched_reloc->sym->sec &&
+ !strcmp(patched_reloc->sym->sec->name, ".altinstr_aux"))
+ continue;
+
+ if (convert_reloc_sym(e->patched, patched_reloc)) {
+ ERROR_FUNC(patched_rsec->base, reloc_offset(patched_reloc),
+ "failed to convert reloc sym '%s' to its proper format",
+ patched_reloc->sym->name);
+ return -1;
+ }
+
+ offset = out_sym->offset + (reloc_offset(patched_reloc) - patched_sym->offset);
+
+ if (clone_reloc(e, patched_reloc, out_sym->sec, offset))
+ return -1;
+ }
+ return 0;
+
+}
+
+static int create_fake_symbol(struct elf *elf, struct section *sec,
+ unsigned long offset, size_t size)
+{
+ char name[SYM_NAME_LEN];
+ unsigned int type;
+ static int ctr;
+ char *c;
+
+ if (snprintf_check(name, SYM_NAME_LEN, "%s_%d", sec->name, ctr++))
+ return -1;
+
+ for (c = name; *c; c++)
+ if (*c == '.')
+ *c = '_';
+
+ /*
+ * STT_NOTYPE: Prevent objtool from validating .altinstr_replacement
+ * while still allowing objdump to disassemble it.
+ */
+ type = is_text_sec(sec) ? STT_NOTYPE : STT_OBJECT;
+ return elf_create_symbol(elf, name, sec, STB_LOCAL, type, offset, size) ? 0 : -1;
+}
+
+/*
+ * Special sections (alternatives, etc) are basically arrays of structs.
+ * For all the special sections, create a symbol for each struct entry. This
+ * is a bit cumbersome, but it makes the extracting of the individual entries
+ * much more straightforward.
+ *
+ * There are three ways to identify the entry sizes for a special section:
+ *
+ * 1) ELF section header sh_entsize: Ideally this would be used almost
+ * everywhere. But unfortunately the toolchains make it difficult. The
+ * assembler .[push]section directive syntax only takes entsize when
+ * combined with SHF_MERGE. But Clang disallows combining SHF_MERGE with
+ * SHF_WRITE. And some special sections do need to be writable.
+ *
+ * Another place this wouldn't work is .altinstr_replacement, whose entries
+ * don't have a fixed size.
+ *
+ * 2) ANNOTATE_DATA_SPECIAL: This is a lightweight objtool annotation which
+ * points to the beginning of each entry. The size of the entry is then
+ * inferred by the location of the subsequent annotation (or end of
+ * section).
+ *
+ * 3) Simple array of pointers: If the special section is just a basic array of
+ * pointers, the entry size can be inferred by the number of relocations.
+ * No annotations needed.
+ *
+ * Note I also tried to create per-entry symbols at the time of creation, in
+ * the original [inline] asm. Unfortunately, creating uniquely named symbols
+ * is trickier than one might think, especially with Clang inline asm. I
+ * eventually just gave up trying to make that work, in favor of using
+ * ANNOTATE_DATA_SPECIAL and creating the symbols here after the fact.
+ */
+static int create_fake_symbols(struct elf *elf)
+{
+ struct section *sec;
+ struct reloc *reloc;
+
+ /*
+ * 1) Make symbols for all the ANNOTATE_DATA_SPECIAL entries:
+ */
+
+ sec = find_section_by_name(elf, ".discard.annotate_data");
+ if (!sec || !sec->rsec)
+ return 0;
+
+ for_each_reloc(sec->rsec, reloc) {
+ unsigned long offset, size;
+ struct reloc *next_reloc;
+
+ if (annotype(elf, sec, reloc) != ANNOTYPE_DATA_SPECIAL)
+ continue;
+
+ offset = reloc_addend(reloc);
+
+ size = 0;
+ next_reloc = reloc;
+ for_each_reloc_continue(sec->rsec, next_reloc) {
+ if (annotype(elf, sec, next_reloc) != ANNOTYPE_DATA_SPECIAL ||
+ next_reloc->sym->sec != reloc->sym->sec)
+ continue;
+
+ size = reloc_addend(next_reloc) - offset;
+ break;
+ }
+
+ if (!size)
+ size = sec_size(reloc->sym->sec) - offset;
+
+ if (create_fake_symbol(elf, reloc->sym->sec, offset, size))
+ return -1;
+ }
+
+ /*
+ * 2) Make symbols for sh_entsize, and simple arrays of pointers:
+ */
+
+ for_each_sec(elf, sec) {
+ unsigned int entry_size;
+ unsigned long offset;
+
+ if (!is_special_section(sec) || find_symbol_by_offset(sec, 0))
+ continue;
+
+ if (!sec->rsec) {
+ ERROR("%s: missing special section relocations", sec->name);
+ return -1;
+ }
+
+ entry_size = sec->sh.sh_entsize;
+ if (!entry_size) {
+ entry_size = arch_reloc_size(sec->rsec->relocs);
+ if (sec_size(sec) != entry_size * sec_num_entries(sec->rsec)) {
+ ERROR("%s: missing special section entsize or annotations", sec->name);
+ return -1;
+ }
+ }
+
+ for (offset = 0; offset < sec_size(sec); offset += entry_size) {
+ if (create_fake_symbol(elf, sec, offset, entry_size))
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/* Keep a special section entry if it references an included function */
+static bool should_keep_special_sym(struct elf *elf, struct symbol *sym)
+{
+ struct reloc *reloc;
+
+ if (is_sec_sym(sym) || !sym->sec->rsec)
+ return false;
+
+ sym_for_each_reloc(elf, sym, reloc) {
+ if (convert_reloc_sym(elf, reloc))
+ continue;
+
+ if (is_func_sym(reloc->sym) && reloc->sym->included)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Klp relocations aren't allowed for __jump_table and .static_call_sites if
+ * the referenced symbol lives in a kernel module, because such klp relocs may
+ * be applied after static branch/call init, resulting in code corruption.
+ *
+ * Validate a special section entry to avoid that. Note that an inert
+ * tracepoint is harmless enough, in that case just skip the entry and print a
+ * warning. Otherwise, return an error.
+ *
+ * This is only a temporary limitation which will be fixed when livepatch adds
+ * support for submodules: fully self-contained modules which are embedded in
+ * the top-level livepatch module's data and which can be loaded on demand when
+ * their corresponding to-be-patched module gets loaded. Then klp relocs can
+ * be retired.
+ *
+ * Return:
+ * -1: error: validation failed
+ * 1: warning: tracepoint skipped
+ * 0: success
+ */
+static int validate_special_section_klp_reloc(struct elfs *e, struct symbol *sym)
+{
+ bool static_branch = !strcmp(sym->sec->name, "__jump_table");
+ bool static_call = !strcmp(sym->sec->name, ".static_call_sites");
+ struct symbol *code_sym = NULL;
+ unsigned long code_offset = 0;
+ struct reloc *reloc;
+ int ret = 0;
+
+ if (!static_branch && !static_call)
+ return 0;
+
+ sym_for_each_reloc(e->patched, sym, reloc) {
+ const char *sym_modname;
+ struct export *export;
+
+ /* Static branch/call keys are always STT_OBJECT */
+ if (reloc->sym->type != STT_OBJECT) {
+
+ /* Save code location which can be printed below */
+ if (reloc->sym->type == STT_FUNC && !code_sym) {
+ code_sym = reloc->sym;
+ code_offset = reloc_addend(reloc);
+ }
+
+ continue;
+ }
+
+ if (!klp_reloc_needed(reloc))
+ continue;
+
+ export = find_export(reloc->sym);
+ if (export) {
+ sym_modname = export->mod;
+ } else {
+ sym_modname = find_modname(e);
+ if (!sym_modname)
+ return -1;
+ }
+
+ /* vmlinux keys are ok */
+ if (!strcmp(sym_modname, "vmlinux"))
+ continue;
+
+ if (static_branch) {
+ if (strstarts(reloc->sym->name, "__tracepoint_")) {
+ WARN("%s: disabling unsupported tracepoint %s",
+ code_sym->name, reloc->sym->name + 13);
+ ret = 1;
+ continue;
+ }
+
+ ERROR("%s+0x%lx: unsupported static branch key %s. Use static_key_enabled() instead",
+ code_sym->name, code_offset, reloc->sym->name);
+ return -1;
+ }
+
+ /* static call */
+ if (strstarts(reloc->sym->name, "__SCK__tp_func_")) {
+ ret = 1;
+ continue;
+ }
+
+ ERROR("%s()+0x%lx: unsupported static call key %s. Use KLP_STATIC_CALL() instead",
+ code_sym->name, code_offset, reloc->sym->name);
+ return -1;
+ }
+
+ return ret;
+}
+
+static int clone_special_section(struct elfs *e, struct section *patched_sec)
+{
+ struct symbol *patched_sym;
+
+ /*
+ * Extract all special section symbols (and their dependencies) which
+ * reference included functions.
+ */
+ sec_for_each_sym(patched_sec, patched_sym) {
+ int ret;
+
+ if (!is_object_sym(patched_sym))
+ continue;
+
+ if (!should_keep_special_sym(e->patched, patched_sym))
+ continue;
+
+ ret = validate_special_section_klp_reloc(e, patched_sym);
+ if (ret < 0)
+ return -1;
+ if (ret > 0)
+ continue;
+
+ if (!clone_symbol(e, patched_sym, true))
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Extract only the needed bits from special sections */
+static int clone_special_sections(struct elfs *e)
+{
+ struct section *patched_sec;
+
+ if (create_fake_symbols(e->patched))
+ return -1;
+
+ for_each_sec(e->patched, patched_sec) {
+ if (is_special_section(patched_sec)) {
+ if (clone_special_section(e, patched_sec))
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Create __klp_objects and __klp_funcs sections which are intermediate
+ * sections provided as input to the patch module's init code for building the
+ * klp_patch, klp_object and klp_func structs for the livepatch API.
+ */
+static int create_klp_sections(struct elfs *e)
+{
+ size_t obj_size = sizeof(struct klp_object_ext);
+ size_t func_size = sizeof(struct klp_func_ext);
+ struct section *obj_sec, *funcs_sec, *str_sec;
+ struct symbol *funcs_sym, *str_sym, *sym;
+ char sym_name[SYM_NAME_LEN];
+ unsigned int nr_funcs = 0;
+ const char *modname;
+ void *obj_data;
+ s64 addend;
+
+ obj_sec = elf_create_section_pair(e->out, KLP_OBJECTS_SEC, obj_size, 0, 0);
+ if (!obj_sec)
+ return -1;
+
+ funcs_sec = elf_create_section_pair(e->out, KLP_FUNCS_SEC, func_size, 0, 0);
+ if (!funcs_sec)
+ return -1;
+
+ funcs_sym = elf_create_section_symbol(e->out, funcs_sec);
+ if (!funcs_sym)
+ return -1;
+
+ str_sec = elf_create_section(e->out, KLP_STRINGS_SEC, 0, 0,
+ SHT_PROGBITS, 1,
+ SHF_ALLOC | SHF_STRINGS | SHF_MERGE);
+ if (!str_sec)
+ return -1;
+
+ if (elf_add_string(e->out, str_sec, "") == -1)
+ return -1;
+
+ str_sym = elf_create_section_symbol(e->out, str_sec);
+ if (!str_sym)
+ return -1;
+
+ /* allocate klp_object_ext */
+ obj_data = elf_add_data(e->out, obj_sec, NULL, obj_size);
+ if (!obj_data)
+ return -1;
+
+ modname = find_modname(e);
+ if (!modname)
+ return -1;
+
+ /* klp_object_ext.name */
+ if (strcmp(modname, "vmlinux")) {
+ addend = elf_add_string(e->out, str_sec, modname);
+ if (addend == -1)
+ return -1;
+
+ if (!elf_create_reloc(e->out, obj_sec,
+ offsetof(struct klp_object_ext, name),
+ str_sym, addend, R_ABS64))
+ return -1;
+ }
+
+ /* klp_object_ext.funcs */
+ if (!elf_create_reloc(e->out, obj_sec, offsetof(struct klp_object_ext, funcs),
+ funcs_sym, 0, R_ABS64))
+ return -1;
+
+ for_each_sym(e->out, sym) {
+ unsigned long offset = nr_funcs * func_size;
+ unsigned long sympos;
+ void *func_data;
+
+ if (!is_func_sym(sym) || sym->cold || !sym->clone || !sym->clone->changed)
+ continue;
+
+ /* allocate klp_func_ext */
+ func_data = elf_add_data(e->out, funcs_sec, NULL, func_size);
+ if (!func_data)
+ return -1;
+
+ /* klp_func_ext.old_name */
+ addend = elf_add_string(e->out, str_sec, sym->clone->twin->name);
+ if (addend == -1)
+ return -1;
+
+ if (!elf_create_reloc(e->out, funcs_sec,
+ offset + offsetof(struct klp_func_ext, old_name),
+ str_sym, addend, R_ABS64))
+ return -1;
+
+ /* klp_func_ext.new_func */
+ if (!elf_create_reloc(e->out, funcs_sec,
+ offset + offsetof(struct klp_func_ext, new_func),
+ sym, 0, R_ABS64))
+ return -1;
+
+ /* klp_func_ext.sympos */
+ BUILD_BUG_ON(sizeof(sympos) != sizeof_field(struct klp_func_ext, sympos));
+ sympos = find_sympos(e->orig, sym->clone->twin);
+ if (sympos == ULONG_MAX)
+ return -1;
+ memcpy(func_data + offsetof(struct klp_func_ext, sympos), &sympos,
+ sizeof_field(struct klp_func_ext, sympos));
+
+ nr_funcs++;
+ }
+
+ /* klp_object_ext.nr_funcs */
+ BUILD_BUG_ON(sizeof(nr_funcs) != sizeof_field(struct klp_object_ext, nr_funcs));
+ memcpy(obj_data + offsetof(struct klp_object_ext, nr_funcs), &nr_funcs,
+ sizeof_field(struct klp_object_ext, nr_funcs));
+
+ /*
+ * Find callback pointers created by KLP_PRE_PATCH_CALLBACK() and
+ * friends, and add them to the klp object.
+ */
+
+ if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_PRE_PATCH_PREFIX "%s", modname))
+ return -1;
+
+ sym = find_symbol_by_name(e->out, sym_name);
+ if (sym) {
+ struct reloc *reloc;
+
+ reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset);
+
+ if (!elf_create_reloc(e->out, obj_sec,
+ offsetof(struct klp_object_ext, callbacks) +
+ offsetof(struct klp_callbacks, pre_patch),
+ reloc->sym, reloc_addend(reloc), R_ABS64))
+ return -1;
+ }
+
+ if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_POST_PATCH_PREFIX "%s", modname))
+ return -1;
+
+ sym = find_symbol_by_name(e->out, sym_name);
+ if (sym) {
+ struct reloc *reloc;
+
+ reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset);
+
+ if (!elf_create_reloc(e->out, obj_sec,
+ offsetof(struct klp_object_ext, callbacks) +
+ offsetof(struct klp_callbacks, post_patch),
+ reloc->sym, reloc_addend(reloc), R_ABS64))
+ return -1;
+ }
+
+ if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_PRE_UNPATCH_PREFIX "%s", modname))
+ return -1;
+
+ sym = find_symbol_by_name(e->out, sym_name);
+ if (sym) {
+ struct reloc *reloc;
+
+ reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset);
+
+ if (!elf_create_reloc(e->out, obj_sec,
+ offsetof(struct klp_object_ext, callbacks) +
+ offsetof(struct klp_callbacks, pre_unpatch),
+ reloc->sym, reloc_addend(reloc), R_ABS64))
+ return -1;
+ }
+
+ if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_POST_UNPATCH_PREFIX "%s", modname))
+ return -1;
+
+ sym = find_symbol_by_name(e->out, sym_name);
+ if (sym) {
+ struct reloc *reloc;
+
+ reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset);
+
+ if (!elf_create_reloc(e->out, obj_sec,
+ offsetof(struct klp_object_ext, callbacks) +
+ offsetof(struct klp_callbacks, post_unpatch),
+ reloc->sym, reloc_addend(reloc), R_ABS64))
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy all .modinfo import_ns= tags to ensure all namespaced exported symbols
+ * can be accessed via normal relocs.
+ */
+static int copy_import_ns(struct elfs *e)
+{
+ struct section *patched_sec, *out_sec = NULL;
+ char *import_ns, *data_end;
+
+ patched_sec = find_section_by_name(e->patched, ".modinfo");
+ if (!patched_sec)
+ return 0;
+
+ import_ns = patched_sec->data->d_buf;
+ if (!import_ns)
+ return 0;
+
+ for (data_end = import_ns + sec_size(patched_sec);
+ import_ns < data_end;
+ import_ns += strlen(import_ns) + 1) {
+
+ import_ns = memmem(import_ns, data_end - import_ns, "import_ns=", 10);
+ if (!import_ns)
+ return 0;
+
+ if (!out_sec) {
+ out_sec = find_section_by_name(e->out, ".modinfo");
+ if (!out_sec) {
+ out_sec = elf_create_section(e->out, ".modinfo", 0,
+ patched_sec->sh.sh_entsize,
+ patched_sec->sh.sh_type,
+ patched_sec->sh.sh_addralign,
+ patched_sec->sh.sh_flags);
+ if (!out_sec)
+ return -1;
+ }
+ }
+
+ if (!elf_add_data(e->out, out_sec, import_ns, strlen(import_ns) + 1))
+ return -1;
+ }
+
+ return 0;
+}
+
+int cmd_klp_diff(int argc, const char **argv)
+{
+ struct elfs e = {0};
+
+ argc = parse_options(argc, argv, klp_diff_options, klp_diff_usage, 0);
+ if (argc != 3)
+ usage_with_options(klp_diff_usage, klp_diff_options);
+
+ objname = argv[0];
+
+ e.orig = elf_open_read(argv[0], O_RDONLY);
+ e.patched = elf_open_read(argv[1], O_RDONLY);
+ e.out = NULL;
+
+ if (!e.orig || !e.patched)
+ return -1;
+
+ if (read_exports())
+ return -1;
+
+ if (read_sym_checksums(e.orig))
+ return -1;
+
+ if (read_sym_checksums(e.patched))
+ return -1;
+
+ if (correlate_symbols(&e))
+ return -1;
+
+ if (mark_changed_functions(&e))
+ return 0;
+
+ e.out = elf_create_file(&e.orig->ehdr, argv[2]);
+ if (!e.out)
+ return -1;
+
+ if (clone_included_functions(&e))
+ return -1;
+
+ if (clone_special_sections(&e))
+ return -1;
+
+ if (create_klp_sections(&e))
+ return -1;
+
+ if (copy_import_ns(&e))
+ return -1;
+
+ if (elf_write(e.out))
+ return -1;
+
+ return elf_close(e.out);
+}
diff --git a/tools/objtool/klp-post-link.c b/tools/objtool/klp-post-link.c
new file mode 100644
index 000000000000..c013e39957b1
--- /dev/null
+++ b/tools/objtool/klp-post-link.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Read the intermediate KLP reloc/symbol representations created by klp diff
+ * and convert them to the proper format required by livepatch. This needs to
+ * run last to avoid linker wreckage. Linkers don't tend to handle the "two
+ * rela sections for a single base section" case very well, nor do they like
+ * SHN_LIVEPATCH.
+ *
+ * This is the final tool in the livepatch module generation pipeline:
+ *
+ * kernel builds -> objtool klp diff -> module link -> objtool klp post-link
+ */
+
+#include <fcntl.h>
+#include <gelf.h>
+#include <objtool/objtool.h>
+#include <objtool/warn.h>
+#include <objtool/klp.h>
+#include <objtool/util.h>
+#include <linux/livepatch_external.h>
+
+static int fix_klp_relocs(struct elf *elf)
+{
+ struct section *symtab, *klp_relocs;
+
+ klp_relocs = find_section_by_name(elf, KLP_RELOCS_SEC);
+ if (!klp_relocs)
+ return 0;
+
+ symtab = find_section_by_name(elf, ".symtab");
+ if (!symtab) {
+ ERROR("missing .symtab");
+ return -1;
+ }
+
+ for (int i = 0; i < sec_size(klp_relocs) / sizeof(struct klp_reloc); i++) {
+ struct klp_reloc *klp_reloc;
+ unsigned long klp_reloc_off;
+ struct section *sec, *tmp, *klp_rsec;
+ unsigned long offset;
+ struct reloc *reloc;
+ char sym_modname[64];
+ char rsec_name[SEC_NAME_LEN];
+ u64 addend;
+ struct symbol *sym, *klp_sym;
+
+ klp_reloc_off = i * sizeof(*klp_reloc);
+ klp_reloc = klp_relocs->data->d_buf + klp_reloc_off;
+
+ /*
+ * Read __klp_relocs[i]:
+ */
+
+ /* klp_reloc.sec_offset */
+ reloc = find_reloc_by_dest(elf, klp_relocs,
+ klp_reloc_off + offsetof(struct klp_reloc, offset));
+ if (!reloc) {
+ ERROR("malformed " KLP_RELOCS_SEC " section");
+ return -1;
+ }
+
+ sec = reloc->sym->sec;
+ offset = reloc_addend(reloc);
+
+ /* klp_reloc.sym */
+ reloc = find_reloc_by_dest(elf, klp_relocs,
+ klp_reloc_off + offsetof(struct klp_reloc, sym));
+ if (!reloc) {
+ ERROR("malformed " KLP_RELOCS_SEC " section");
+ return -1;
+ }
+
+ klp_sym = reloc->sym;
+ addend = reloc_addend(reloc);
+
+ /* symbol format: .klp.sym.modname.sym_name,sympos */
+ if (sscanf(klp_sym->name + strlen(KLP_SYM_PREFIX), "%55[^.]", sym_modname) != 1)
+ ERROR("can't find modname in klp symbol '%s'", klp_sym->name);
+
+ /*
+ * Create the KLP rela:
+ */
+
+ /* section format: .klp.rela.sec_objname.section_name */
+ if (snprintf_check(rsec_name, SEC_NAME_LEN,
+ KLP_RELOC_SEC_PREFIX "%s.%s",
+ sym_modname, sec->name))
+ return -1;
+
+ klp_rsec = find_section_by_name(elf, rsec_name);
+ if (!klp_rsec) {
+ klp_rsec = elf_create_section(elf, rsec_name, 0,
+ elf_rela_size(elf),
+ SHT_RELA, elf_addr_size(elf),
+ SHF_ALLOC | SHF_INFO_LINK | SHF_RELA_LIVEPATCH);
+ if (!klp_rsec)
+ return -1;
+
+ klp_rsec->sh.sh_link = symtab->idx;
+ klp_rsec->sh.sh_info = sec->idx;
+ klp_rsec->base = sec;
+ }
+
+ tmp = sec->rsec;
+ sec->rsec = klp_rsec;
+ if (!elf_create_reloc(elf, sec, offset, klp_sym, addend, klp_reloc->type))
+ return -1;
+ sec->rsec = tmp;
+
+ /*
+ * Fix up the corresponding KLP symbol:
+ */
+
+ klp_sym->sym.st_shndx = SHN_LIVEPATCH;
+ if (!gelf_update_sym(symtab->data, klp_sym->idx, &klp_sym->sym)) {
+ ERROR_ELF("gelf_update_sym");
+ return -1;
+ }
+
+ /*
+ * Disable the original non-KLP reloc by converting it to R_*_NONE:
+ */
+
+ reloc = find_reloc_by_dest(elf, sec, offset);
+ sym = reloc->sym;
+ sym->sym.st_shndx = SHN_LIVEPATCH;
+ set_reloc_type(elf, reloc, 0);
+ if (!gelf_update_sym(symtab->data, sym->idx, &sym->sym)) {
+ ERROR_ELF("gelf_update_sym");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * This runs on the livepatch module after all other linking has been done. It
+ * converts the intermediate __klp_relocs section into proper KLP relocs to be
+ * processed by livepatch. This needs to run last to avoid linker wreckage.
+ * Linkers don't tend to handle the "two rela sections for a single base
+ * section" case very well, nor do they appreciate SHN_LIVEPATCH.
+ */
+int cmd_klp_post_link(int argc, const char **argv)
+{
+ struct elf *elf;
+
+ argc--;
+ argv++;
+
+ if (argc != 1) {
+ fprintf(stderr, "%d\n", argc);
+ fprintf(stderr, "usage: objtool link <file.ko>\n");
+ return -1;
+ }
+
+ elf = elf_open_read(argv[0], O_RDWR);
+ if (!elf)
+ return -1;
+
+ if (fix_klp_relocs(elf))
+ return -1;
+
+ if (elf_write(elf))
+ return -1;
+
+ return elf_close(elf);
+}
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index 802895fae3ca..14f8ab653449 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -36,6 +36,7 @@ NORETURN(machine_real_restart)
NORETURN(make_task_dead)
NORETURN(mpt_halt_firmware)
NORETURN(mwait_play_dead)
+NORETURN(native_play_dead)
NORETURN(nmi_panic_self_stop)
NORETURN(panic)
NORETURN(vpanic)
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index 5c8b974ad0f9..3c26ed561c7e 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -16,7 +16,8 @@
#include <objtool/objtool.h>
#include <objtool/warn.h>
-bool help;
+bool debug;
+int indent;
static struct objtool_file file;
@@ -71,6 +72,39 @@ int objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func)
return 0;
}
+char *top_level_dir(const char *file)
+{
+ ssize_t len, self_len, file_len;
+ char self[PATH_MAX], *str;
+ int i;
+
+ len = readlink("/proc/self/exe", self, sizeof(self) - 1);
+ if (len <= 0)
+ return NULL;
+ self[len] = '\0';
+
+ for (i = 0; i < 3; i++) {
+ char *s = strrchr(self, '/');
+ if (!s)
+ return NULL;
+ *s = '\0';
+ }
+
+ self_len = strlen(self);
+ file_len = strlen(file);
+
+ str = malloc(self_len + file_len + 2);
+ if (!str)
+ return NULL;
+
+ memcpy(str, self, self_len);
+ str[self_len] = '/';
+ strcpy(str + self_len + 1, file);
+
+ return str;
+}
+
+
int main(int argc, const char **argv)
{
static const char *UNUSED = "OBJTOOL_NOT_IMPLEMENTED";
@@ -79,5 +113,11 @@ int main(int argc, const char **argv)
exec_cmd_init("objtool", UNUSED, UNUSED, UNUSED);
pager_init(UNUSED);
+ if (argc > 1 && !strcmp(argv[1], "klp")) {
+ argc--;
+ argv++;
+ return cmd_klp(argc, argv);
+ }
+
return objtool_run(argc, argv);
}
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 1dd9fc18fe62..5a979f52425a 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -8,7 +8,6 @@
#include <objtool/objtool.h>
#include <objtool/orc.h>
#include <objtool/warn.h>
-#include <objtool/endianness.h>
int orc_dump(const char *filename)
{
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index 922e6aac7cea..1045e1380ffd 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -12,7 +12,6 @@
#include <objtool/check.h>
#include <objtool/orc.h>
#include <objtool/warn.h>
-#include <objtool/endianness.h>
struct orc_list_entry {
struct list_head list;
@@ -57,7 +56,7 @@ int orc_create(struct objtool_file *file)
/* Build a deduplicated list of ORC entries: */
INIT_LIST_HEAD(&orc_list);
- for_each_sec(file, sec) {
+ for_each_sec(file->elf, sec) {
struct orc_entry orc, prev_orc = {0};
struct instruction *insn;
bool empty = true;
@@ -127,7 +126,11 @@ int orc_create(struct objtool_file *file)
return -1;
}
orc_sec = elf_create_section(file->elf, ".orc_unwind",
- sizeof(struct orc_entry), nr);
+ nr * sizeof(struct orc_entry),
+ sizeof(struct orc_entry),
+ SHT_PROGBITS,
+ 1,
+ SHF_ALLOC);
if (!orc_sec)
return -1;
diff --git a/tools/objtool/special.c b/tools/objtool/special.c
index c80fed8a840e..2a533afbc69a 100644
--- a/tools/objtool/special.c
+++ b/tools/objtool/special.c
@@ -15,7 +15,6 @@
#include <objtool/builtin.h>
#include <objtool/special.h>
#include <objtool/warn.h>
-#include <objtool/endianness.h>
struct special_entry {
const char *sec;
@@ -82,6 +81,8 @@ static int get_alt_entry(struct elf *elf, const struct special_entry *entry,
entry->orig_len);
alt->new_len = *(unsigned char *)(sec->data->d_buf + offset +
entry->new_len);
+ alt->feature = *(unsigned int *)(sec->data->d_buf + offset +
+ entry->feature);
}
orig_reloc = find_reloc_by_dest(elf, sec, offset + entry->orig);
@@ -133,7 +134,7 @@ int special_get_alts(struct elf *elf, struct list_head *alts)
struct section *sec;
unsigned int nr_entries;
struct special_alt *alt;
- int idx, ret;
+ int idx;
INIT_LIST_HEAD(alts);
@@ -142,12 +143,12 @@ int special_get_alts(struct elf *elf, struct list_head *alts)
if (!sec)
continue;
- if (sec->sh.sh_size % entry->size != 0) {
+ if (sec_size(sec) % entry->size != 0) {
ERROR("%s size not a multiple of %d", sec->name, entry->size);
return -1;
}
- nr_entries = sec->sh.sh_size / entry->size;
+ nr_entries = sec_size(sec) / entry->size;
for (idx = 0; idx < nr_entries; idx++) {
alt = malloc(sizeof(*alt));
@@ -157,11 +158,8 @@ int special_get_alts(struct elf *elf, struct list_head *alts)
}
memset(alt, 0, sizeof(*alt));
- ret = get_alt_entry(elf, entry, sec, idx, alt);
- if (ret > 0)
- continue;
- if (ret < 0)
- return ret;
+ if (get_alt_entry(elf, entry, sec, idx, alt))
+ return -1;
list_add_tail(&alt->list, alts);
}
diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh
index 81d120d05442..e38167ca56a9 100755
--- a/tools/objtool/sync-check.sh
+++ b/tools/objtool/sync-check.sh
@@ -16,6 +16,8 @@ arch/x86/include/asm/orc_types.h
arch/x86/include/asm/emulate_prefix.h
arch/x86/lib/x86-opcode-map.txt
arch/x86/tools/gen-insn-attr-x86.awk
+include/linux/interval_tree_generic.h
+include/linux/livepatch_external.h
include/linux/static_call_types.h
"
diff --git a/tools/objtool/trace.c b/tools/objtool/trace.c
new file mode 100644
index 000000000000..5dec44dab781
--- /dev/null
+++ b/tools/objtool/trace.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ */
+
+#include <objtool/trace.h>
+
+bool trace;
+int trace_depth;
+
+/*
+ * Macros to trace CFI state attributes changes.
+ */
+
+#define TRACE_CFI_ATTR(attr, prev, next, fmt, ...) \
+({ \
+ if ((prev)->attr != (next)->attr) \
+ TRACE("%s=" fmt " ", #attr, __VA_ARGS__); \
+})
+
+#define TRACE_CFI_ATTR_BOOL(attr, prev, next) \
+ TRACE_CFI_ATTR(attr, prev, next, \
+ "%s", (next)->attr ? "true" : "false")
+
+#define TRACE_CFI_ATTR_NUM(attr, prev, next, fmt) \
+ TRACE_CFI_ATTR(attr, prev, next, fmt, (next)->attr)
+
+#define CFI_REG_NAME_MAXLEN 16
+
+/*
+ * Return the name of a register. Note that the same static buffer
+ * is returned if the name is dynamically generated.
+ */
+static const char *cfi_reg_name(unsigned int reg)
+{
+ static char rname_buffer[CFI_REG_NAME_MAXLEN];
+ const char *rname;
+
+ switch (reg) {
+ case CFI_UNDEFINED:
+ return "<undefined>";
+ case CFI_CFA:
+ return "cfa";
+ case CFI_SP_INDIRECT:
+ return "(sp)";
+ case CFI_BP_INDIRECT:
+ return "(bp)";
+ }
+
+ if (reg < CFI_NUM_REGS) {
+ rname = arch_reg_name[reg];
+ if (rname)
+ return rname;
+ }
+
+ if (snprintf(rname_buffer, CFI_REG_NAME_MAXLEN, "r%d", reg) == -1)
+ return "<error>";
+
+ return (const char *)rname_buffer;
+}
+
+/*
+ * Functions and macros to trace CFI registers changes.
+ */
+
+static void trace_cfi_reg(const char *prefix, int reg, const char *fmt,
+ int base_prev, int offset_prev,
+ int base_next, int offset_next)
+{
+ char *rname;
+
+ if (base_prev == base_next && offset_prev == offset_next)
+ return;
+
+ if (prefix)
+ TRACE("%s:", prefix);
+
+ if (base_next == CFI_UNDEFINED) {
+ TRACE("%1$s=<undef> ", cfi_reg_name(reg));
+ } else {
+ rname = strdup(cfi_reg_name(reg));
+ TRACE(fmt, rname, cfi_reg_name(base_next), offset_next);
+ free(rname);
+ }
+}
+
+static void trace_cfi_reg_val(const char *prefix, int reg,
+ int base_prev, int offset_prev,
+ int base_next, int offset_next)
+{
+ trace_cfi_reg(prefix, reg, "%1$s=%2$s%3$+d ",
+ base_prev, offset_prev, base_next, offset_next);
+}
+
+static void trace_cfi_reg_ref(const char *prefix, int reg,
+ int base_prev, int offset_prev,
+ int base_next, int offset_next)
+{
+ trace_cfi_reg(prefix, reg, "%1$s=(%2$s%3$+d) ",
+ base_prev, offset_prev, base_next, offset_next);
+}
+
+#define TRACE_CFI_REG_VAL(reg, prev, next) \
+ trace_cfi_reg_val(NULL, reg, prev.base, prev.offset, \
+ next.base, next.offset)
+
+#define TRACE_CFI_REG_REF(reg, prev, next) \
+ trace_cfi_reg_ref(NULL, reg, prev.base, prev.offset, \
+ next.base, next.offset)
+
+void trace_insn_state(struct instruction *insn, struct insn_state *sprev,
+ struct insn_state *snext)
+{
+ struct cfi_state *cprev, *cnext;
+ int i;
+
+ if (!memcmp(sprev, snext, sizeof(struct insn_state)))
+ return;
+
+ cprev = &sprev->cfi;
+ cnext = &snext->cfi;
+
+ disas_print_insn(stderr, objtool_disas_ctx, insn,
+ trace_depth - 1, "state: ");
+
+ /* print registers changes */
+ TRACE_CFI_REG_VAL(CFI_CFA, cprev->cfa, cnext->cfa);
+ for (i = 0; i < CFI_NUM_REGS; i++) {
+ TRACE_CFI_REG_VAL(i, cprev->vals[i], cnext->vals[i]);
+ TRACE_CFI_REG_REF(i, cprev->regs[i], cnext->regs[i]);
+ }
+
+ /* print attributes changes */
+ TRACE_CFI_ATTR_NUM(stack_size, cprev, cnext, "%d");
+ TRACE_CFI_ATTR_BOOL(drap, cprev, cnext);
+ if (cnext->drap) {
+ trace_cfi_reg_val("drap", cnext->drap_reg,
+ cprev->drap_reg, cprev->drap_offset,
+ cnext->drap_reg, cnext->drap_offset);
+ }
+ TRACE_CFI_ATTR_BOOL(bp_scratch, cprev, cnext);
+ TRACE_CFI_ATTR_NUM(instr, sprev, snext, "%d");
+ TRACE_CFI_ATTR_NUM(uaccess_stack, sprev, snext, "%u");
+
+ TRACE("\n");
+
+ insn->trace = 1;
+}
+
+void trace_alt_begin(struct instruction *orig_insn, struct alternative *alt,
+ char *alt_name)
+{
+ struct instruction *alt_insn;
+ char suffix[2];
+
+ alt_insn = alt->insn;
+
+ if (alt->type == ALT_TYPE_EX_TABLE) {
+ /*
+ * When there is an exception table then the instruction
+ * at the original location is executed but it can cause
+ * an exception. In that case, the execution will be
+ * redirected to the alternative instruction.
+ *
+ * The instruction at the original location can have
+ * instruction alternatives, so we just print the location
+ * of the instruction that can cause the exception and
+ * not the instruction itself.
+ */
+ TRACE_ALT_INFO_NOADDR(orig_insn, "/ ", "%s for instruction at 0x%lx <%s+0x%lx>",
+ alt_name,
+ orig_insn->offset, orig_insn->sym->name,
+ orig_insn->offset - orig_insn->sym->offset);
+ } else {
+ TRACE_ALT_INFO_NOADDR(orig_insn, "/ ", "%s", alt_name);
+ }
+
+ if (alt->type == ALT_TYPE_JUMP_TABLE) {
+ /*
+ * For a jump alternative, if the default instruction is
+ * a NOP then it is replaced with the jmp instruction,
+ * otherwise it is replaced with a NOP instruction.
+ */
+ trace_depth++;
+ if (orig_insn->type == INSN_NOP) {
+ suffix[0] = (orig_insn->len == 5) ? 'q' : '\0';
+ TRACE_ADDR(orig_insn, "jmp%-3s %lx <%s+0x%lx>", suffix,
+ alt_insn->offset, alt_insn->sym->name,
+ alt_insn->offset - alt_insn->sym->offset);
+ } else {
+ TRACE_ADDR(orig_insn, "nop%d", orig_insn->len);
+ trace_depth--;
+ }
+ }
+}
+
+void trace_alt_end(struct instruction *orig_insn, struct alternative *alt,
+ char *alt_name)
+{
+ if (alt->type == ALT_TYPE_JUMP_TABLE && orig_insn->type == INSN_NOP)
+ trace_depth--;
+ TRACE_ALT_INFO_NOADDR(orig_insn, "\\ ", "%s", alt_name);
+}
diff --git a/tools/objtool/weak.c b/tools/objtool/weak.c
index d83f607733b0..d6562f292259 100644
--- a/tools/objtool/weak.c
+++ b/tools/objtool/weak.c
@@ -8,6 +8,8 @@
#include <stdbool.h>
#include <errno.h>
#include <objtool/objtool.h>
+#include <objtool/arch.h>
+#include <objtool/builtin.h>
#define UNSUPPORTED(name) \
({ \
@@ -24,3 +26,8 @@ int __weak orc_create(struct objtool_file *file)
{
UNSUPPORTED("ORC");
}
+
+int __weak cmd_klp(int argc, const char **argv)
+{
+ UNSUPPORTED("klp");
+}
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 5700516aa84a..2dd5f5a60568 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -354,9 +354,6 @@ FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
FEATURE_CHECK_LDFLAGS-libaio = -lrt
-FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl
-FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl
-
CORE_CFLAGS += -fno-omit-frame-pointer
CORE_CFLAGS += -Wall
CORE_CFLAGS += -Wextra
@@ -930,6 +927,8 @@ ifdef BUILD_NONDISTRO
ifeq ($(feature-libbfd), 1)
EXTLIBS += -lbfd -lopcodes
+ FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl
+ FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl
else
# we are on a system that requires -liberty and (maybe) -lz
# to link against -lbfd; test each case individually here
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 47c906b807ef..02f87c49801f 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -234,12 +234,12 @@ endif
# The fixdep build - we force fixdep tool to be built as
# the first target in the separate make session not to be
# disturbed by any parallel make jobs. Once fixdep is done
-# we issue the requested build with FIXDEP=1 variable.
+# we issue the requested build with FIXDEP_BUILT=1 variable.
#
# The fixdep build is disabled for $(NON_CONFIG_TARGETS)
# targets, because it's not necessary.
-ifdef FIXDEP
+ifdef FIXDEP_BUILT
force_fixdep := 0
else
force_fixdep := $(config)
@@ -286,7 +286,7 @@ $(goals) all: sub-make
sub-make: fixdep
@./check-headers.sh
- $(Q)$(MAKE) FIXDEP=1 -f Makefile.perf $(goals)
+ $(Q)$(MAKE) FIXDEP_BUILT=1 -f Makefile.perf $(goals)
else # force_fixdep
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index 92cf0fe2291e..ced2a1deecd7 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
335 common uretprobe sys_uretprobe
+336 common uprobe sys_uprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 078634461df2..e8962c985d34 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -1867,6 +1867,7 @@ static int __cmd_report(bool display_info)
eops.sample = process_sample_event;
eops.comm = perf_event__process_comm;
eops.mmap = perf_event__process_mmap;
+ eops.mmap2 = perf_event__process_mmap2;
eops.namespaces = perf_event__process_namespaces;
eops.tracing_data = perf_event__process_tracing_data;
session = perf_session__new(&data, &eops);
@@ -2023,6 +2024,7 @@ static int __cmd_contention(int argc, const char **argv)
eops.sample = process_sample_event;
eops.comm = perf_event__process_comm;
eops.mmap = perf_event__process_mmap;
+ eops.mmap2 = perf_event__process_mmap2;
eops.tracing_data = perf_event__process_tracing_data;
perf_env__init(&host_env);
diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh
index 7248a74ca2a3..6dd90519f45c 100755
--- a/tools/perf/tests/shell/lock_contention.sh
+++ b/tools/perf/tests/shell/lock_contention.sh
@@ -13,15 +13,18 @@ cleanup() {
rm -f ${perfdata}
rm -f ${result}
rm -f ${errout}
- trap - EXIT TERM INT
+ trap - EXIT TERM INT ERR
}
trap_cleanup() {
+ if (( $? == 139 )); then #SIGSEGV
+ err=1
+ fi
echo "Unexpected signal in ${FUNCNAME[1]}"
cleanup
exit ${err}
}
-trap trap_cleanup EXIT TERM INT
+trap trap_cleanup EXIT TERM INT ERR
check() {
if [ "$(id -u)" != 0 ]; then
@@ -145,7 +148,7 @@ test_aggr_cgroup()
fi
# the perf lock contention output goes to the stderr
- perf lock con -a -b -g -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result}
+ perf lock con -a -b --lock-cgroup -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result}
if [ "$(cat "${result}" | wc -l)" != "1" ]; then
echo "[Fail] BPF result count is not 1:" "$(cat "${result}" | wc -l)"
err=1
@@ -271,7 +274,7 @@ test_cgroup_filter()
return
fi
- perf lock con -a -b -g -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result}
+ perf lock con -a -b --lock-cgroup -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result}
if [ "$(cat "${result}" | wc -l)" != "1" ]; then
echo "[Fail] BPF result should have a cgroup result:" "$(cat "${result}")"
err=1
@@ -279,7 +282,7 @@ test_cgroup_filter()
fi
cgroup=$(cat "${result}" | awk '{ print $3 }')
- perf lock con -a -b -g -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result}
+ perf lock con -a -b --lock-cgroup -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result}
if [ "$(cat "${result}" | wc -l)" != "1" ]; then
echo "[Fail] BPF result should have a result with cgroup filter:" "$(cat "${cgroup}")"
err=1
@@ -338,4 +341,5 @@ test_aggr_task_stack_filter
test_cgroup_filter
test_csv_output
+cleanup
exit ${err}
diff --git a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h
index f291ab4f94eb..3741ea1b73d8 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h
@@ -111,6 +111,7 @@
#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */
#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */
/* Generic flags for the *at(2) family of syscalls. */
diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h
index 0bd678a4a10e..beb4c2d1e41c 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/fs.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h
@@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t;
/* buffered IO that drops the cache after reading or writing data */
#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080)
+/* prevent pipe and socket writes from raising SIGPIPE */
+#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100)
+
/* mask of flags supported by the kernel */
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\
- RWF_DONTCACHE)
+ RWF_DONTCACHE | RWF_NOSIGNAL)
#define PROCFS_IOCTL_MAGIC 'f'
diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
index ed3aed264aeb..51c4e8c82b1e 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
@@ -177,7 +177,17 @@ struct prctl_mm_map {
#define PR_GET_TID_ADDRESS 40
+/*
+ * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0
+ * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively
+ * return "1" when no flags were specified for PR_SET_THP_DISABLE.
+ */
#define PR_SET_THP_DISABLE 41
+/*
+ * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE /
+ * VM_HUGEPAGE, MADV_COLLAPSE).
+ */
+# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
#define PR_GET_THP_DISABLE 42
/*
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 4f2a6e10ed5c..4e12be579140 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1022,12 +1022,9 @@ static int write_bpf_prog_info(struct feat_fd *ff,
down_read(&env->bpf_progs.lock);
- if (env->bpf_progs.infos_cnt == 0)
- goto out;
-
ret = do_write(ff, &env->bpf_progs.infos_cnt,
sizeof(env->bpf_progs.infos_cnt));
- if (ret < 0)
+ if (ret < 0 || env->bpf_progs.infos_cnt == 0)
goto out;
root = &env->bpf_progs.infos;
@@ -1067,13 +1064,10 @@ static int write_bpf_btf(struct feat_fd *ff,
down_read(&env->bpf_progs.lock);
- if (env->bpf_progs.btfs_cnt == 0)
- goto out;
-
ret = do_write(ff, &env->bpf_progs.btfs_cnt,
sizeof(env->bpf_progs.btfs_cnt));
- if (ret < 0)
+ if (ret < 0 || env->bpf_progs.btfs_cnt == 0)
goto out;
root = &env->bpf_progs.btfs;
diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c
index 01147fbf73b3..6434c2dccd4a 100644
--- a/tools/perf/util/libbfd.c
+++ b/tools/perf/util/libbfd.c
@@ -38,6 +38,39 @@ struct a2l_data {
asymbol **syms;
};
+static bool perf_bfd_lock(void *bfd_mutex)
+{
+ mutex_lock(bfd_mutex);
+ return true;
+}
+
+static bool perf_bfd_unlock(void *bfd_mutex)
+{
+ mutex_unlock(bfd_mutex);
+ return true;
+}
+
+static void perf_bfd_init(void)
+{
+ static struct mutex bfd_mutex;
+
+ mutex_init_recursive(&bfd_mutex);
+
+ if (bfd_init() != BFD_INIT_MAGIC) {
+ pr_err("Error initializing libbfd\n");
+ return;
+ }
+ if (!bfd_thread_init(perf_bfd_lock, perf_bfd_unlock, &bfd_mutex))
+ pr_err("Error initializing libbfd threading\n");
+}
+
+static void ensure_bfd_init(void)
+{
+ static pthread_once_t bfd_init_once = PTHREAD_ONCE_INIT;
+
+ pthread_once(&bfd_init_once, perf_bfd_init);
+}
+
static int bfd_error(const char *string)
{
const char *errmsg;
@@ -132,6 +165,7 @@ static struct a2l_data *addr2line_init(const char *path)
bfd *abfd;
struct a2l_data *a2l = NULL;
+ ensure_bfd_init();
abfd = bfd_openr(path, NULL);
if (abfd == NULL)
return NULL;
@@ -288,6 +322,7 @@ int dso__load_bfd_symbols(struct dso *dso, const char *debugfile)
bfd *abfd;
u64 start, len;
+ ensure_bfd_init();
abfd = bfd_openr(debugfile, NULL);
if (!abfd)
return -1;
@@ -393,6 +428,7 @@ int libbfd__read_build_id(const char *filename, struct build_id *bid, bool block
if (fd < 0)
return -1;
+ ensure_bfd_init();
abfd = bfd_fdopenr(filename, /*target=*/NULL, fd);
if (!abfd)
return -1;
@@ -421,6 +457,7 @@ int libbfd_filename__read_debuglink(const char *filename, char *debuglink,
asection *section;
bfd *abfd;
+ ensure_bfd_init();
abfd = bfd_openr(filename, NULL);
if (!abfd)
return -1;
@@ -480,6 +517,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused,
memset(tpath, 0, sizeof(tpath));
perf_exe(tpath, sizeof(tpath));
+ ensure_bfd_init();
bfdf = bfd_openr(tpath, NULL);
if (bfdf == NULL)
abort();
diff --git a/tools/perf/util/mutex.c b/tools/perf/util/mutex.c
index bca7f0717f35..7aa1f3f55a7d 100644
--- a/tools/perf/util/mutex.c
+++ b/tools/perf/util/mutex.c
@@ -17,7 +17,7 @@ static void check_err(const char *fn, int err)
#define CHECK_ERR(err) check_err(__func__, err)
-static void __mutex_init(struct mutex *mtx, bool pshared)
+static void __mutex_init(struct mutex *mtx, bool pshared, bool recursive)
{
pthread_mutexattr_t attr;
@@ -27,21 +27,27 @@ static void __mutex_init(struct mutex *mtx, bool pshared)
/* In normal builds enable error checking, such as recursive usage. */
CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK));
#endif
+ if (recursive)
+ CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE));
if (pshared)
CHECK_ERR(pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED));
-
CHECK_ERR(pthread_mutex_init(&mtx->lock, &attr));
CHECK_ERR(pthread_mutexattr_destroy(&attr));
}
void mutex_init(struct mutex *mtx)
{
- __mutex_init(mtx, /*pshared=*/false);
+ __mutex_init(mtx, /*pshared=*/false, /*recursive=*/false);
}
void mutex_init_pshared(struct mutex *mtx)
{
- __mutex_init(mtx, /*pshared=*/true);
+ __mutex_init(mtx, /*pshared=*/true, /*recursive=*/false);
+}
+
+void mutex_init_recursive(struct mutex *mtx)
+{
+ __mutex_init(mtx, /*pshared=*/false, /*recursive=*/true);
}
void mutex_destroy(struct mutex *mtx)
diff --git a/tools/perf/util/mutex.h b/tools/perf/util/mutex.h
index 38458f00846f..70232d8d094f 100644
--- a/tools/perf/util/mutex.h
+++ b/tools/perf/util/mutex.h
@@ -104,6 +104,8 @@ void mutex_init(struct mutex *mtx);
* process-private attribute.
*/
void mutex_init_pshared(struct mutex *mtx);
+/* Initializes a mutex that may be recursively held on the same thread. */
+void mutex_init_recursive(struct mutex *mtx);
void mutex_destroy(struct mutex *mtx);
void mutex_lock(struct mutex *mtx) EXCLUSIVE_LOCK_FUNCTION(*mtx);
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index cc26b7bf302b..948d3e8ad782 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -112,9 +112,13 @@ static bool symbol_type__filter(char __symbol_type)
// 'N' first seen in:
// ffffffff9b35d130 N __pfx__RNCINvNtNtNtCsbDUBuN8AbD4_4core4iter8adapters3map12map_try_foldjNtCs6vVzKs5jPr6_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_
// a seemingly Rust mangled name
+ // Ditto for '1':
+ // root@x1:~# grep ' 1 ' /proc/kallsyms
+ // ffffffffb098bc00 1 __pfx__RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_
+ // ffffffffb098bc10 1 _RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_
char symbol_type = toupper(__symbol_type);
return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D' || symbol_type == 'B' ||
- __symbol_type == 'u' || __symbol_type == 'l' || __symbol_type == 'N';
+ __symbol_type == 'u' || __symbol_type == 'l' || __symbol_type == 'N' || __symbol_type == '1';
}
static int prefix_underscores_count(const char *str)
diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c
index 44a9ecbd91e8..4d9b0177c312 100644
--- a/tools/power/acpi/tools/pfrut/pfrut.c
+++ b/tools/power/acpi/tools/pfrut/pfrut.c
@@ -222,6 +222,7 @@ int main(int argc, char *argv[])
fd_update_log = open("/dev/acpi_pfr_telemetry0", O_RDWR);
if (fd_update_log < 0) {
printf("PFRT device not supported - Quit...\n");
+ close(fd_update);
return 1;
}
@@ -265,7 +266,8 @@ int main(int argc, char *argv[])
printf("chunk2_size:%d\n", data_info.chunk2_size);
printf("rollover_cnt:%d\n", data_info.rollover_cnt);
printf("reset_cnt:%d\n", data_info.reset_cnt);
-
+ close(fd_update);
+ close(fd_update_log);
return 0;
}
@@ -358,6 +360,7 @@ int main(int argc, char *argv[])
if (ret == -1) {
perror("Failed to load capsule file");
+ munmap(addr_map_capsule, st.st_size);
close(fd_capsule);
close(fd_update);
close(fd_update_log);
@@ -420,7 +423,7 @@ int main(int argc, char *argv[])
if (p_mmap == MAP_FAILED) {
perror("mmap error.");
close(fd_update_log);
-
+ free(log_buf);
return 1;
}
diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile
index c43db1c41205..a1df9196dc45 100644
--- a/tools/power/cpupower/Makefile
+++ b/tools/power/cpupower/Makefile
@@ -37,9 +37,7 @@ NLS ?= true
# cpufreq-bench benchmarking tool
CPUFREQ_BENCH ?= true
-# Do not build libraries, but build the code in statically
-# Libraries are still built, otherwise the Makefile code would
-# be rather ugly.
+# Build the code, including libraries, statically.
export STATIC ?= false
# Prefix to the directories we're installing to
@@ -207,14 +205,25 @@ $(OUTPUT)lib/%.o: $(LIB_SRC) $(LIB_HEADERS)
$(ECHO) " CC " $@
$(QUIET) $(CC) $(CFLAGS) -fPIC -o $@ -c lib/$*.c
-$(OUTPUT)libcpupower.so.$(LIB_VER): $(LIB_OBJS)
+ifeq ($(strip $(STATIC)),true)
+LIBCPUPOWER := libcpupower.a
+else
+LIBCPUPOWER := libcpupower.so.$(LIB_VER)
+endif
+
+$(OUTPUT)$(LIBCPUPOWER): $(LIB_OBJS)
+ifeq ($(strip $(STATIC)),true)
+ $(ECHO) " AR " $@
+ $(QUIET) $(AR) rcs $@ $(LIB_OBJS)
+else
$(ECHO) " LD " $@
$(QUIET) $(CC) -shared $(CFLAGS) $(LDFLAGS) -o $@ \
-Wl,-soname,libcpupower.so.$(LIB_MAJ) $(LIB_OBJS)
@ln -sf $(@F) $(OUTPUT)libcpupower.so
@ln -sf $(@F) $(OUTPUT)libcpupower.so.$(LIB_MAJ)
+endif
-libcpupower: $(OUTPUT)libcpupower.so.$(LIB_VER)
+libcpupower: $(OUTPUT)$(LIBCPUPOWER)
# Let all .o files depend on its .c file and all headers
# Might be worth to put this into utils/Makefile at some point of time
@@ -224,7 +233,7 @@ $(OUTPUT)%.o: %.c
$(ECHO) " CC " $@
$(QUIET) $(CC) $(CFLAGS) -I./lib -I ./utils -o $@ -c $*.c
-$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_VER)
+$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)$(LIBCPUPOWER)
$(ECHO) " CC " $@
ifeq ($(strip $(STATIC)),true)
$(QUIET) $(CC) $(CFLAGS) $(LDFLAGS) $(UTIL_OBJS) -lrt -lpci -L$(OUTPUT) -o $@
@@ -269,7 +278,7 @@ update-po: $(OUTPUT)po/$(PACKAGE).pot
done;
endif
-compile-bench: $(OUTPUT)libcpupower.so.$(LIB_VER)
+compile-bench: $(OUTPUT)$(LIBCPUPOWER)
@V=$(V) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT)
# we compile into subdirectories. if the target directory is not the
@@ -287,6 +296,7 @@ clean:
-find $(OUTPUT) \( -not -type d \) -and \( -name '*~' -o -name '*.[oas]' \) -type f -print \
| xargs rm -f
-rm -f $(OUTPUT)cpupower
+ -rm -f $(OUTPUT)libcpupower.a
-rm -f $(OUTPUT)libcpupower.so*
-rm -rf $(OUTPUT)po/*.gmo
-rm -rf $(OUTPUT)po/*.pot
@@ -295,7 +305,11 @@ clean:
install-lib: libcpupower
$(INSTALL) -d $(DESTDIR)${libdir}
+ifeq ($(strip $(STATIC)),true)
+ $(CP) $(OUTPUT)libcpupower.a $(DESTDIR)${libdir}/
+else
$(CP) $(OUTPUT)libcpupower.so* $(DESTDIR)${libdir}/
+endif
$(INSTALL) -d $(DESTDIR)${includedir}
$(INSTALL_DATA) lib/cpufreq.h $(DESTDIR)${includedir}/cpufreq.h
$(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h
@@ -336,11 +350,7 @@ install-bench: compile-bench
@#DESTDIR must be set from outside to survive
@sbindir=$(sbindir) bindir=$(bindir) docdir=$(docdir) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) install
-ifeq ($(strip $(STATIC)),true)
-install: all install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH)
-else
install: all install-lib install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH)
-endif
uninstall:
- rm -f $(DESTDIR)${libdir}/libcpupower.*
diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile
index d68780e2e03d..e4bda2474060 100644
--- a/tools/sched_ext/Makefile
+++ b/tools/sched_ext/Makefile
@@ -133,6 +133,7 @@ $(MAKE_DIRS):
$(call msg,MKDIR,,$@)
$(Q)mkdir -p $@
+ifneq ($(CROSS_COMPILE),)
$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
$(APIDIR)/linux/bpf.h \
| $(OBJ_DIR)/libbpf
@@ -141,6 +142,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
EXTRA_CFLAGS='-g -O0 -fPIC' \
LDFLAGS="$(LDFLAGS)" \
DESTDIR=$(OUTPUT_DIR) prefix= all install_headers
+endif
$(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
$(APIDIR)/linux/bpf.h \
@@ -187,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
-c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
$(addprefix $(BINDIR)/,$(c-sched-targets)): \
$(BINDIR)/%: \
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 06e2551033cb..821d5791bd42 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -60,21 +60,15 @@ static inline void ___vmlinux_h_sanity_check___(void)
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
-s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
- const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
-void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
-void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
+s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
+ struct scx_bpf_select_cpu_and_args *args) __ksym __weak;
+bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch_cancel(void) __ksym;
-bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak;
-void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
-void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
-bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-u32 scx_bpf_reenqueue_local(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
@@ -105,7 +99,6 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
struct rq *scx_bpf_locked_rq(void) __ksym;
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
-struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index dd9144624dc9..f2969c3061a7 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -16,119 +16,92 @@
})
/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
-#define __COMPAT_scx_bpf_task_cgroup(p) \
- (bpf_ksym_exists(scx_bpf_task_cgroup) ? \
- scx_bpf_task_cgroup((p)) : NULL)
+struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak;
+
+#define scx_bpf_task_cgroup(p) \
+ (bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \
+ scx_bpf_task_cgroup___new((p)) : NULL)
/*
* v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are
* renamed to unload the verb.
*
- * Build error is triggered if old names are used. New binaries work with both
- * new and old names. The compat macros will be removed on v6.15 release.
- *
* scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by
* 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()").
- * Preserve __COMPAT macros until v6.15.
*/
-void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
-void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
-bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak;
-void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
-void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
-bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
-
-#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \
- (bpf_ksym_exists(scx_bpf_dsq_insert) ? \
- scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \
- scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags)))
-
-#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \
- (bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \
- scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \
- scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags)))
+bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak;
+void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
+void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
+bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+
+bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak;
+void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
+void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
+bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
#define scx_bpf_dsq_move_to_local(dsq_id) \
- (bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \
- scx_bpf_dsq_move_to_local((dsq_id)) : \
- scx_bpf_consume___compat((dsq_id)))
-
-#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \
- (bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \
- scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \
- (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \
- scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \
+ (bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \
+ scx_bpf_dsq_move_to_local___new((dsq_id)) : \
+ scx_bpf_consume___old((dsq_id)))
+
+#define scx_bpf_dsq_move_set_slice(it__iter, slice) \
+ (bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? \
+ scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \
+ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \
+ scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \
+ (void)0))
+
+#define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
+ (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \
+ scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \
+ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \
+ scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \
(void)0))
-#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
- (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \
- scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \
- (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \
- scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \
- (void) 0))
-
-#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
- (bpf_ksym_exists(scx_bpf_dsq_move) ? \
- scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \
- (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \
- scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
+#define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
+ (bpf_ksym_exists(scx_bpf_dsq_move___new) ? \
+ scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \
+ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? \
+ scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
false))
-#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
- (bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \
- scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \
- (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \
- scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
+#define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
+ (bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? \
+ scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \
+ (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \
+ scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
false))
+/*
+ * v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits")
+ *
+ * Compat macro will be dropped on v6.19 release.
+ */
+int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
+
#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \
(bpf_ksym_exists(bpf_cpumask_populate) ? \
(bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
-#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \
- _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()")
-
-#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \
- _Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()")
-
-#define scx_bpf_consume(dsq_id) ({ \
- _Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \
- false; \
-})
-
-#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
- _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()")
-
-#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
- _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()")
-
-#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
- _Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \
- false; \
-})
-
-#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
- _Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \
- false; \
-})
-
-#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
- _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()")
-
-#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
- _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()")
-
-#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
- _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \
- false; \
-})
-
-#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
- _Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \
- false; \
-})
+/*
+ * v6.19: Introduce lockless peek API for user DSQs.
+ *
+ * Preserve the following macro until v6.21.
+ */
+static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
+{
+ struct task_struct *p = NULL;
+ struct bpf_iter_scx_dsq it;
+
+ if (bpf_ksym_exists(scx_bpf_dsq_peek))
+ return scx_bpf_dsq_peek(dsq_id);
+ if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0))
+ p = bpf_iter_scx_dsq_next(&it);
+ bpf_iter_scx_dsq_destroy(&it);
+ return p;
+}
/**
* __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
@@ -248,6 +221,161 @@ static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
}
/*
+ * v6.19: To work around BPF maximum parameter limit, the following kfuncs are
+ * replaced with variants that pack scalar arguments in a struct. Wrappers are
+ * provided to maintain source compatibility.
+ *
+ * v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the
+ * block on dispatch renaming above for more details.
+ *
+ * The kernel will carry the compat variants until v6.23 to maintain binary
+ * compatibility. After v6.23 release, remove the compat handling and move the
+ * wrappers to common.bpf.h.
+ */
+s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
+void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
+void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
+
+/**
+ * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @cpus_allowed: cpumask of allowed CPUs
+ * @flags: %SCX_PICK_IDLE* flags
+ *
+ * Inline wrapper that packs scalar arguments into a struct and calls
+ * __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details.
+ */
+static inline s32
+scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
+{
+ if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) {
+ struct scx_bpf_select_cpu_and_args args = {
+ .prev_cpu = prev_cpu,
+ .wake_flags = wake_flags,
+ .flags = flags,
+ };
+
+ return __scx_bpf_select_cpu_and(p, cpus_allowed, &args);
+ } else {
+ return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags,
+ cpus_allowed, flags);
+ }
+}
+
+/**
+ * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Inline wrapper that packs scalar arguments into a struct and calls
+ * __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details.
+ */
+static inline bool
+scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime,
+ u64 enq_flags)
+{
+ if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) {
+ struct scx_bpf_dsq_insert_vtime_args args = {
+ .dsq_id = dsq_id,
+ .slice = slice,
+ .vtime = vtime,
+ .enq_flags = enq_flags,
+ };
+
+ return __scx_bpf_dsq_insert_vtime(p, &args);
+ } else if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat)) {
+ scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime,
+ enq_flags);
+ return true;
+ } else {
+ scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime,
+ enq_flags);
+ return true;
+ }
+}
+
+/*
+ * v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move
+ * scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22.
+ * The extra ___compat suffix is to work around libbpf not ignoring __SUFFIX on
+ * kernel side. The entire suffix can be dropped later.
+ *
+ * v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on
+ * dispatch renaming above for more details.
+ */
+bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
+void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
+void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
+
+static inline bool
+scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags)
+{
+ if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat)) {
+ return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice, enq_flags);
+ } else if (bpf_ksym_exists(scx_bpf_dsq_insert___v1)) {
+ scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags);
+ return true;
+ } else {
+ scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags);
+ return true;
+ }
+}
+
+/*
+ * v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added to for
+ * sub-sched authority checks. Drop the wrappers and move the decls to
+ * common.bpf.h after v6.22.
+ */
+bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak;
+bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak;
+
+static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
+{
+ if (bpf_ksym_exists(scx_bpf_task_set_slice___new))
+ scx_bpf_task_set_slice___new(p, slice);
+ else
+ p->scx.slice = slice;
+}
+
+static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
+{
+ if (bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new))
+ scx_bpf_task_set_dsq_vtime___new(p, vtime);
+ else
+ p->scx.dsq_vtime = vtime;
+}
+
+/*
+ * v6.19: The new void variant can be called from anywhere while the older v1
+ * variant can only be called from ops.cpu_release(). The double ___ prefixes on
+ * the v2 variant need to be removed once libbpf is updated to ignore ___ prefix
+ * on kernel side. Drop the wrapper and move the decl to common.bpf.h after
+ * v6.22.
+ */
+u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak;
+void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak;
+
+static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void)
+{
+ return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat);
+}
+
+static inline void scx_bpf_reenqueue_local(void)
+{
+ if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+ scx_bpf_reenqueue_local___v2___compat();
+ else
+ scx_bpf_reenqueue_local___v1();
+}
+
+/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
*/
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 35c67c5174ac..8b4897fc8b99 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -151,6 +151,10 @@ static inline long scx_hotplug_seq(void)
*
* ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
* the current minimum required kernel version.
+ *
+ * COMPAT:
+ * - v6.17: ops.cgroup_set_bandwidth()
+ * - v6.19: ops.cgroup_set_idle()
*/
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \
@@ -162,6 +166,16 @@ static inline long scx_hotplug_seq(void)
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
SCX_ENUM_INIT(__skel); \
+ if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \
+ !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \
+ fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \
+ __skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \
+ } \
+ if (__skel->struct_ops.__ops_name->cgroup_set_idle && \
+ !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \
+ fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \
+ __skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \
+ } \
__skel; \
})
diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c
new file mode 100644
index 000000000000..6326ce598c8e
--- /dev/null
+++ b/tools/sched_ext/scx_cpu0.bpf.c
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A CPU0 scheduler.
+ *
+ * This scheduler queues all tasks to a shared DSQ and only dispatches them on
+ * CPU0 in FIFO order. This is useful for testing bypass behavior when many
+ * tasks are concentrated on a single CPU. If the load balancer doesn't work,
+ * bypass mode can trigger task hangs or RCU stalls as the queue is long and
+ * there's only one CPU working on it.
+ *
+ * - Statistics tracking how many tasks are queued to local and CPU0 DSQs.
+ * - Termination notification for userspace.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
+
+UEI_DEFINE(uei);
+
+/*
+ * We create a custom DSQ with ID 0 that we dispatch to and consume from on
+ * CPU0.
+ */
+#define DSQ_CPU0 0
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u64));
+ __uint(max_entries, 2); /* [local, cpu0] */
+} stats SEC(".maps");
+
+static void stat_inc(u32 idx)
+{
+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
+ if (cnt_p)
+ (*cnt_p)++;
+}
+
+s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+ return 0;
+}
+
+void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ /*
+ * select_cpu() always picks CPU0. If @p is not on CPU0, it can't run on
+ * CPU 0. Queue on whichever CPU it's currently only.
+ */
+ if (scx_bpf_task_cpu(p) != 0) {
+ stat_inc(0); /* count local queueing */
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+ return;
+ }
+
+ stat_inc(1); /* count cpu0 queueing */
+ scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev)
+{
+ if (cpu == 0)
+ scx_bpf_dsq_move_to_local(DSQ_CPU0);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
+{
+ return scx_bpf_create_dsq(DSQ_CPU0, -1);
+}
+
+void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(cpu0_ops,
+ .select_cpu = (void *)cpu0_select_cpu,
+ .enqueue = (void *)cpu0_enqueue,
+ .dispatch = (void *)cpu0_dispatch,
+ .init = (void *)cpu0_init,
+ .exit = (void *)cpu0_exit,
+ .name = "cpu0");
diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c
new file mode 100644
index 000000000000..1e4fa4ab8da9
--- /dev/null
+++ b/tools/sched_ext/scx_cpu0.c
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <assert.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_cpu0.bpf.skel.h"
+
+const char help_fmt[] =
+"A cpu0 sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-v]\n"
+"\n"
+" -v Print libbpf debug messages\n"
+" -h Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+ if (level == LIBBPF_DEBUG && !verbose)
+ return 0;
+ return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int sig)
+{
+ exit_req = 1;
+}
+
+static void read_stats(struct scx_cpu0 *skel, __u64 *stats)
+{
+ int nr_cpus = libbpf_num_possible_cpus();
+ assert(nr_cpus > 0);
+ __u64 cnts[2][nr_cpus];
+ __u32 idx;
+
+ memset(stats, 0, sizeof(stats[0]) * 2);
+
+ for (idx = 0; idx < 2; idx++) {
+ int ret, cpu;
+
+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
+ &idx, cnts[idx]);
+ if (ret < 0)
+ continue;
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ stats[idx] += cnts[idx][cpu];
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct scx_cpu0 *skel;
+ struct bpf_link *link;
+ __u32 opt;
+ __u64 ecode;
+
+ libbpf_set_print(libbpf_print_fn);
+ signal(SIGINT, sigint_handler);
+ signal(SIGTERM, sigint_handler);
+restart:
+ skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
+
+ skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+
+ while ((opt = getopt(argc, argv, "vh")) != -1) {
+ switch (opt) {
+ case 'v':
+ verbose = true;
+ break;
+ default:
+ fprintf(stderr, help_fmt, basename(argv[0]));
+ return opt != 'h';
+ }
+ }
+
+ SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei);
+ link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0);
+
+ while (!exit_req && !UEI_EXITED(skel, uei)) {
+ __u64 stats[2];
+
+ read_stats(skel, stats);
+ printf("local=%llu cpu0=%llu\n", stats[0], stats[1]);
+ fflush(stdout);
+ sleep(1);
+ }
+
+ bpf_link__destroy(link);
+ ecode = UEI_REPORT(skel, uei);
+ scx_cpu0__destroy(skel);
+
+ if (UEI_ECODE_RESTART(ecode))
+ goto restart;
+ return 0;
+}
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 2c720e3ecad5..43126858b8e4 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -382,7 +382,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp);
if (!cgc)
goto out_release;
@@ -508,7 +508,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
{
struct cgroup *cgrp;
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
update_active_weight_sums(cgrp, true);
bpf_cgroup_release(cgrp);
}
@@ -521,7 +521,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
if (fifo_sched)
return;
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp);
if (cgc) {
/*
@@ -564,7 +564,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
if (!taskc->bypassed_at)
return;
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp);
if (cgc) {
__sync_fetch_and_add(&cgc->cvtime_delta,
@@ -578,7 +578,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
{
struct cgroup *cgrp;
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
update_active_weight_sums(cgrp, false);
bpf_cgroup_release(cgrp);
}
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 3072b593f898..df21fad0c438 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
void *ring;
s32 cpu;
+ if (enq_flags & SCX_ENQ_REENQ)
+ __sync_fetch_and_add(&nr_reenqueued, 1);
+
if (p->flags & PF_KTHREAD) {
if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
return;
@@ -320,12 +323,9 @@ static bool dispatch_highpri(bool from_timer)
if (tctx->highpri) {
/* exercise the set_*() and vtime interface too */
- __COMPAT_scx_bpf_dsq_move_set_slice(
- BPF_FOR_EACH_ITER, slice_ns * 2);
- __COMPAT_scx_bpf_dsq_move_set_vtime(
- BPF_FOR_EACH_ITER, highpri_seq++);
- __COMPAT_scx_bpf_dsq_move_vtime(
- BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
+ scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
+ scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
+ scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
}
}
@@ -342,9 +342,8 @@ static bool dispatch_highpri(bool from_timer)
else
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
- if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
- SCX_DSQ_LOCAL_ON | cpu,
- SCX_ENQ_PREEMPT)) {
+ if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
+ SCX_ENQ_PREEMPT)) {
if (cpu == this_cpu) {
dispatched = true;
__sync_fetch_and_add(&nr_expedited_local, 1);
@@ -533,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
return task_qdist(a) > task_qdist(b);
}
-void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+SEC("tp_btf/sched_switch")
+int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
+ struct task_struct *next, unsigned long prev_state)
{
- u32 cnt;
+ if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+ return 0;
/*
- * Called when @cpu is taken by a higher priority scheduling class. This
- * makes @cpu no longer available for executing sched_ext tasks. As we
- * don't want the tasks in @cpu's local dsq to sit there until @cpu
- * becomes available again, re-enqueue them into the global dsq. See
- * %SCX_ENQ_REENQ handling in qmap_enqueue().
+ * If @cpu is taken by a higher priority scheduling class, it is no
+ * longer available for executing sched_ext tasks. As we don't want the
+ * tasks in @cpu's local dsq to sit there until @cpu becomes available
+ * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
+ * handling in qmap_enqueue().
*/
- cnt = scx_bpf_reenqueue_local();
- if (cnt)
- __sync_fetch_and_add(&nr_reenqueued, cnt);
+ switch (next->policy) {
+ case 1: /* SCHED_FIFO */
+ case 2: /* SCHED_RR */
+ case 6: /* SCHED_DEADLINE */
+ scx_bpf_reenqueue_local();
+ }
+
+ return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+{
+ /* see qmap_sched_switch() to learn how to do this on newer kernels */
+ if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+ scx_bpf_reenqueue_local();
}
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c
index a85c19e9524e..0114108ab25f 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c
@@ -1071,7 +1071,7 @@ static bool sve_write_supported(struct test_config *config)
static bool sve_write_fpsimd_supported(struct test_config *config)
{
- if (!sve_supported())
+ if (!sve_supported() && !sme_supported())
return false;
if ((config->svcr_in & SVCR_ZA) != (config->svcr_expected & SVCR_ZA))
@@ -1231,9 +1231,6 @@ static void sve_write_fpsimd(pid_t child, struct test_config *config)
vl = vl_expected(config);
vq = __sve_vq_from_vl(vl);
- if (!vl)
- return;
-
iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_FPSIMD);
iov.iov_base = malloc(iov.iov_len);
if (!iov.iov_base) {
diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index e0fc3a001e28..f44d44618575 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -394,6 +394,58 @@ out:
free(svebuf);
}
+/* Write the FPSIMD registers via the SVE regset when SVE is not supported */
+static void ptrace_sve_fpsimd_no_sve(pid_t child)
+{
+ void *svebuf;
+ struct user_sve_header *sve;
+ struct user_fpsimd_state *fpsimd, new_fpsimd;
+ unsigned int i, j;
+ unsigned char *p;
+ int ret;
+
+ svebuf = malloc(SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD));
+ if (!svebuf) {
+ ksft_test_result_fail("Failed to allocate FPSIMD buffer\n");
+ return;
+ }
+
+ /* On a system without SVE the VL should be set to 0 */
+ memset(svebuf, 0, SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD));
+ sve = svebuf;
+ sve->flags = SVE_PT_REGS_FPSIMD;
+ sve->size = SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD);
+ sve->vl = 0;
+
+ /* Try to set a known FPSIMD state via PT_REGS_SVE */
+ fpsimd = (struct user_fpsimd_state *)((char *)sve +
+ SVE_PT_FPSIMD_OFFSET);
+ for (i = 0; i < 32; ++i) {
+ p = (unsigned char *)&fpsimd->vregs[i];
+
+ for (j = 0; j < sizeof(fpsimd->vregs[i]); ++j)
+ p[j] = j;
+ }
+
+ ret = set_sve(child, &vec_types[0], sve);
+ ksft_test_result(ret == 0, "FPSIMD write via SVE\n");
+ if (ret) {
+ ksft_test_result_skip("Verify FPSIMD write via SVE\n");
+ goto out;
+ }
+
+ /* Verify via the FPSIMD regset */
+ if (get_fpsimd(child, &new_fpsimd)) {
+ ksft_test_result_skip("Verify FPSIMD write via SVE\n");
+ goto out;
+ }
+ ksft_test_result(memcmp(fpsimd, &new_fpsimd, sizeof(*fpsimd)) == 0,
+ "Verify FPSIMD write via SVE\n");
+
+out:
+ free(svebuf);
+}
+
/* Validate attempting to set SVE data and read SVE data */
static void ptrace_set_sve_get_sve_data(pid_t child,
const struct vec_type *type,
@@ -826,6 +878,15 @@ static int do_parent(pid_t child)
}
}
+ /* We support SVE writes of FPSMID format on SME only systems */
+ if (!(getauxval(AT_HWCAP) & HWCAP_SVE) &&
+ (getauxval(AT_HWCAP2) & HWCAP2_SME)) {
+ ptrace_sve_fpsimd_no_sve(child);
+ } else {
+ ksft_test_result_skip("FPSIMD write via SVE\n");
+ ksft_test_result_skip("Verify FPSIMD write via SVE\n");
+ }
+
ret = EXIT_SUCCESS;
error:
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S
index 38080f3c3280..a8df05771670 100644
--- a/tools/testing/selftests/arm64/fp/zt-test.S
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -276,7 +276,7 @@ function barf
bl putdec
puts ", iteration="
mov x0, x22
- bl putdec
+ bl putdecn
puts "\tExpected ["
mov x0, x10
mov x1, x12
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 70b28c1e653e..f2a2fd236ca8 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -50,6 +50,7 @@ CONFIG_IPV6_SIT=y
CONFIG_IPV6_TUNNEL=y
CONFIG_KEYS=y
CONFIG_LIRC=y
+CONFIG_LIVEPATCH=y
CONFIG_LWTUNNEL=y
CONFIG_MODULE_SIG=y
CONFIG_MODULE_SRCVERSION_ALL=y
@@ -111,6 +112,8 @@ CONFIG_IP6_NF_FILTER=y
CONFIG_NF_NAT=y
CONFIG_PACKET=y
CONFIG_RC_CORE=y
+CONFIG_SAMPLES=y
+CONFIG_SAMPLE_LIVEPATCH=m
CONFIG_SECURITY=y
CONFIG_SECURITYFS=y
CONFIG_SYN_COOKIES=y
diff --git a/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c
new file mode 100644
index 000000000000..72aa5376c30e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "testing_helpers.h"
+#include "livepatch_trampoline.skel.h"
+
+static int load_livepatch(void)
+{
+ char path[4096];
+
+ /* CI will set KBUILD_OUTPUT */
+ snprintf(path, sizeof(path), "%s/samples/livepatch/livepatch-sample.ko",
+ getenv("KBUILD_OUTPUT") ? : "../../../..");
+
+ return load_module(path, env_verbosity > VERBOSE_NONE);
+}
+
+static void unload_livepatch(void)
+{
+ /* Disable the livepatch before unloading the module */
+ system("echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled");
+
+ unload_module("livepatch_sample", env_verbosity > VERBOSE_NONE);
+}
+
+static void read_proc_cmdline(void)
+{
+ char buf[4096];
+ int fd, ret;
+
+ fd = open("/proc/cmdline", O_RDONLY);
+ if (!ASSERT_OK_FD(fd, "open /proc/cmdline"))
+ return;
+
+ ret = read(fd, buf, sizeof(buf));
+ if (!ASSERT_GT(ret, 0, "read /proc/cmdline"))
+ goto out;
+
+ ASSERT_OK(strncmp(buf, "this has been live patched", 26), "strncmp");
+
+out:
+ close(fd);
+}
+
+static void __test_livepatch_trampoline(bool fexit_first)
+{
+ struct livepatch_trampoline *skel = NULL;
+ int err;
+
+ skel = livepatch_trampoline__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+ goto out;
+
+ skel->bss->my_pid = getpid();
+
+ if (!fexit_first) {
+ /* fentry program is loaded first by default */
+ err = livepatch_trampoline__attach(skel);
+ if (!ASSERT_OK(err, "skel_attach"))
+ goto out;
+ } else {
+ /* Manually load fexit program first. */
+ skel->links.fexit_cmdline = bpf_program__attach(skel->progs.fexit_cmdline);
+ if (!ASSERT_OK_PTR(skel->links.fexit_cmdline, "attach_fexit"))
+ goto out;
+
+ skel->links.fentry_cmdline = bpf_program__attach(skel->progs.fentry_cmdline);
+ if (!ASSERT_OK_PTR(skel->links.fentry_cmdline, "attach_fentry"))
+ goto out;
+ }
+
+ read_proc_cmdline();
+
+ ASSERT_EQ(skel->bss->fentry_hit, 1, "fentry_hit");
+ ASSERT_EQ(skel->bss->fexit_hit, 1, "fexit_hit");
+out:
+ livepatch_trampoline__destroy(skel);
+}
+
+void test_livepatch_trampoline(void)
+{
+ int retry_cnt = 0;
+
+retry:
+ if (load_livepatch()) {
+ if (retry_cnt) {
+ ASSERT_OK(1, "load_livepatch");
+ goto out;
+ }
+ /*
+ * Something else (previous run of the same test?) loaded
+ * the KLP module. Unload the KLP module and retry.
+ */
+ unload_livepatch();
+ retry_cnt++;
+ goto retry;
+ }
+
+ if (test__start_subtest("fentry_first"))
+ __test_livepatch_trampoline(false);
+
+ if (test__start_subtest("fexit_first"))
+ __test_livepatch_trampoline(true);
+out:
+ unload_livepatch();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index f8eb7f9d4fd2..8fade8bdc451 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -6,11 +6,13 @@
#include <netinet/in.h>
#include <test_progs.h>
#include <unistd.h>
+#include <errno.h>
#include "cgroup_helpers.h"
#include "network_helpers.h"
#include "mptcp_sock.skel.h"
#include "mptcpify.skel.h"
#include "mptcp_subflow.skel.h"
+#include "mptcp_sockmap.skel.h"
#define NS_TEST "mptcp_ns"
#define ADDR_1 "10.0.1.1"
@@ -436,6 +438,142 @@ close_cgroup:
close(cgroup_fd);
}
+/* Test sockmap on MPTCP server handling non-mp-capable clients. */
+static void test_sockmap_with_mptcp_fallback(struct mptcp_sockmap *skel)
+{
+ int listen_fd = -1, client_fd1 = -1, client_fd2 = -1;
+ int server_fd1 = -1, server_fd2 = -1, sent, recvd;
+ char snd[9] = "123456789";
+ char rcv[10];
+
+ /* start server with MPTCP enabled */
+ listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0);
+ if (!ASSERT_OK_FD(listen_fd, "sockmap-fb:start_mptcp_server"))
+ return;
+
+ skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd));
+ skel->bss->sk_index = 0;
+ /* create client without MPTCP enabled */
+ client_fd1 = connect_to_fd_opts(listen_fd, NULL);
+ if (!ASSERT_OK_FD(client_fd1, "sockmap-fb:connect_to_fd"))
+ goto end;
+
+ server_fd1 = accept(listen_fd, NULL, 0);
+ skel->bss->sk_index = 1;
+ client_fd2 = connect_to_fd_opts(listen_fd, NULL);
+ if (!ASSERT_OK_FD(client_fd2, "sockmap-fb:connect_to_fd"))
+ goto end;
+
+ server_fd2 = accept(listen_fd, NULL, 0);
+ /* test normal redirect behavior: data sent by client_fd1 can be
+ * received by client_fd2
+ */
+ skel->bss->redirect_idx = 1;
+ sent = send(client_fd1, snd, sizeof(snd), 0);
+ if (!ASSERT_EQ(sent, sizeof(snd), "sockmap-fb:send(client_fd1)"))
+ goto end;
+
+ /* try to recv more bytes to avoid truncation check */
+ recvd = recv(client_fd2, rcv, sizeof(rcv), 0);
+ if (!ASSERT_EQ(recvd, sizeof(snd), "sockmap-fb:recv(client_fd2)"))
+ goto end;
+
+end:
+ if (client_fd1 >= 0)
+ close(client_fd1);
+ if (client_fd2 >= 0)
+ close(client_fd2);
+ if (server_fd1 >= 0)
+ close(server_fd1);
+ if (server_fd2 >= 0)
+ close(server_fd2);
+ close(listen_fd);
+}
+
+/* Test sockmap rejection of MPTCP sockets - both server and client sides. */
+static void test_sockmap_reject_mptcp(struct mptcp_sockmap *skel)
+{
+ int listen_fd = -1, server_fd = -1, client_fd1 = -1;
+ int err, zero = 0;
+
+ /* start server with MPTCP enabled */
+ listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0);
+ if (!ASSERT_OK_FD(listen_fd, "start_mptcp_server"))
+ return;
+
+ skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd));
+ skel->bss->sk_index = 0;
+ /* create client with MPTCP enabled */
+ client_fd1 = connect_to_fd(listen_fd, 0);
+ if (!ASSERT_OK_FD(client_fd1, "connect_to_fd client_fd1"))
+ goto end;
+
+ /* bpf_sock_map_update() called from sockops should reject MPTCP sk */
+ if (!ASSERT_EQ(skel->bss->helper_ret, -EOPNOTSUPP, "should reject"))
+ goto end;
+
+ server_fd = accept(listen_fd, NULL, 0);
+ err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map),
+ &zero, &server_fd, BPF_NOEXIST);
+ if (!ASSERT_EQ(err, -EOPNOTSUPP, "server should be disallowed"))
+ goto end;
+
+ /* MPTCP client should also be disallowed */
+ err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map),
+ &zero, &client_fd1, BPF_NOEXIST);
+ if (!ASSERT_EQ(err, -EOPNOTSUPP, "client should be disallowed"))
+ goto end;
+end:
+ if (client_fd1 >= 0)
+ close(client_fd1);
+ if (server_fd >= 0)
+ close(server_fd);
+ close(listen_fd);
+}
+
+static void test_mptcp_sockmap(void)
+{
+ struct mptcp_sockmap *skel;
+ struct netns_obj *netns;
+ int cgroup_fd, err;
+
+ cgroup_fd = test__join_cgroup("/mptcp_sockmap");
+ if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup: mptcp_sockmap"))
+ return;
+
+ skel = mptcp_sockmap__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_sockmap"))
+ goto close_cgroup;
+
+ skel->links.mptcp_sockmap_inject =
+ bpf_program__attach_cgroup(skel->progs.mptcp_sockmap_inject, cgroup_fd);
+ if (!ASSERT_OK_PTR(skel->links.mptcp_sockmap_inject, "attach sockmap"))
+ goto skel_destroy;
+
+ err = bpf_prog_attach(bpf_program__fd(skel->progs.mptcp_sockmap_redirect),
+ bpf_map__fd(skel->maps.sock_map),
+ BPF_SK_SKB_STREAM_VERDICT, 0);
+ if (!ASSERT_OK(err, "bpf_prog_attach stream verdict"))
+ goto skel_destroy;
+
+ netns = netns_new(NS_TEST, true);
+ if (!ASSERT_OK_PTR(netns, "netns_new: mptcp_sockmap"))
+ goto skel_destroy;
+
+ if (endpoint_init("subflow") < 0)
+ goto close_netns;
+
+ test_sockmap_with_mptcp_fallback(skel);
+ test_sockmap_reject_mptcp(skel);
+
+close_netns:
+ netns_free(netns);
+skel_destroy:
+ mptcp_sockmap__destroy(skel);
+close_cgroup:
+ close(cgroup_fd);
+}
+
void test_mptcp(void)
{
if (test__start_subtest("base"))
@@ -444,4 +582,6 @@ void test_mptcp(void)
test_mptcpify();
if (test__start_subtest("subflow"))
test_subflow();
+ if (test__start_subtest("sockmap"))
+ test_mptcp_sockmap();
}
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
new file mode 100644
index 000000000000..c9efdd2a5b18
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "stacktrace_ips.skel.h"
+
+#ifdef __x86_64__
+static int check_stacktrace_ips(int fd, __u32 key, int cnt, ...)
+{
+ __u64 ips[PERF_MAX_STACK_DEPTH];
+ struct ksyms *ksyms = NULL;
+ int i, err = 0;
+ va_list args;
+
+ /* sorted by addr */
+ ksyms = load_kallsyms_local();
+ if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_local"))
+ return -1;
+
+ /* unlikely, but... */
+ if (!ASSERT_LT(cnt, PERF_MAX_STACK_DEPTH, "check_max"))
+ return -1;
+
+ err = bpf_map_lookup_elem(fd, &key, ips);
+ if (err)
+ goto out;
+
+ /*
+ * Compare all symbols provided via arguments with stacktrace ips,
+ * and their related symbol addresses.t
+ */
+ va_start(args, cnt);
+
+ for (i = 0; i < cnt; i++) {
+ unsigned long val;
+ struct ksym *ksym;
+
+ val = va_arg(args, unsigned long);
+ ksym = ksym_search_local(ksyms, ips[i]);
+ if (!ASSERT_OK_PTR(ksym, "ksym_search_local"))
+ break;
+ ASSERT_EQ(ksym->addr, val, "stack_cmp");
+ }
+
+ va_end(args);
+
+out:
+ free_kallsyms_local(ksyms);
+ return err;
+}
+
+static void test_stacktrace_ips_kprobe_multi(bool retprobe)
+{
+ LIBBPF_OPTS(bpf_kprobe_multi_opts, opts,
+ .retprobe = retprobe
+ );
+ LIBBPF_OPTS(bpf_test_run_opts, topts);
+ struct stacktrace_ips *skel;
+
+ skel = stacktrace_ips__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load"))
+ return;
+
+ if (!skel->kconfig->CONFIG_UNWINDER_ORC) {
+ test__skip();
+ goto cleanup;
+ }
+
+ skel->links.kprobe_multi_test = bpf_program__attach_kprobe_multi_opts(
+ skel->progs.kprobe_multi_test,
+ "bpf_testmod_stacktrace_test", &opts);
+ if (!ASSERT_OK_PTR(skel->links.kprobe_multi_test, "bpf_program__attach_kprobe_multi_opts"))
+ goto cleanup;
+
+ trigger_module_test_read(1);
+
+ load_kallsyms();
+
+ check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4,
+ ksym_get_addr("bpf_testmod_stacktrace_test_3"),
+ ksym_get_addr("bpf_testmod_stacktrace_test_2"),
+ ksym_get_addr("bpf_testmod_stacktrace_test_1"),
+ ksym_get_addr("bpf_testmod_test_read"));
+
+cleanup:
+ stacktrace_ips__destroy(skel);
+}
+
+static void test_stacktrace_ips_raw_tp(void)
+{
+ __u32 info_len = sizeof(struct bpf_prog_info);
+ LIBBPF_OPTS(bpf_test_run_opts, topts);
+ struct bpf_prog_info info = {};
+ struct stacktrace_ips *skel;
+ __u64 bpf_prog_ksym = 0;
+ int err;
+
+ skel = stacktrace_ips__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load"))
+ return;
+
+ if (!skel->kconfig->CONFIG_UNWINDER_ORC) {
+ test__skip();
+ goto cleanup;
+ }
+
+ skel->links.rawtp_test = bpf_program__attach_raw_tracepoint(
+ skel->progs.rawtp_test,
+ "bpf_testmod_test_read");
+ if (!ASSERT_OK_PTR(skel->links.rawtp_test, "bpf_program__attach_raw_tracepoint"))
+ goto cleanup;
+
+ /* get bpf program address */
+ info.jited_ksyms = ptr_to_u64(&bpf_prog_ksym);
+ info.nr_jited_ksyms = 1;
+ err = bpf_prog_get_info_by_fd(bpf_program__fd(skel->progs.rawtp_test),
+ &info, &info_len);
+ if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd"))
+ goto cleanup;
+
+ trigger_module_test_read(1);
+
+ load_kallsyms();
+
+ check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 2,
+ bpf_prog_ksym,
+ ksym_get_addr("bpf_trace_run2"));
+
+cleanup:
+ stacktrace_ips__destroy(skel);
+}
+
+static void __test_stacktrace_ips(void)
+{
+ if (test__start_subtest("kprobe_multi"))
+ test_stacktrace_ips_kprobe_multi(false);
+ if (test__start_subtest("kretprobe_multi"))
+ test_stacktrace_ips_kprobe_multi(true);
+ if (test__start_subtest("raw_tp"))
+ test_stacktrace_ips_raw_tp();
+}
+#else
+static void __test_stacktrace_ips(void)
+{
+ test__skip();
+}
+#endif
+
+void test_stacktrace_ips(void)
+{
+ __test_stacktrace_ips();
+}
diff --git a/tools/testing/selftests/bpf/progs/iters_looping.c b/tools/testing/selftests/bpf/progs/iters_looping.c
index 05fa5ce7fc59..d00fd570255a 100644
--- a/tools/testing/selftests/bpf/progs/iters_looping.c
+++ b/tools/testing/selftests/bpf/progs/iters_looping.c
@@ -161,3 +161,56 @@ int simplest_loop(void *ctx)
return 0;
}
+
+__used
+static void iterator_with_diff_stack_depth(int x)
+{
+ struct bpf_iter_num iter;
+
+ asm volatile (
+ "if r1 == 42 goto 0f;"
+ "*(u64 *)(r10 - 128) = 0;"
+ "0:"
+ /* create iterator */
+ "r1 = %[iter];"
+ "r2 = 0;"
+ "r3 = 10;"
+ "call %[bpf_iter_num_new];"
+ "1:"
+ /* consume next item */
+ "r1 = %[iter];"
+ "call %[bpf_iter_num_next];"
+ "if r0 == 0 goto 2f;"
+ "goto 1b;"
+ "2:"
+ /* destroy iterator */
+ "r1 = %[iter];"
+ "call %[bpf_iter_num_destroy];"
+ :
+ : __imm_ptr(iter), ITER_HELPERS
+ : __clobber_common, "r6"
+ );
+}
+
+SEC("socket")
+__success
+__naked int widening_stack_size_bug(void *ctx)
+{
+ /*
+ * Depending on iterator_with_diff_stack_depth() parameter value,
+ * subprogram stack depth is either 8 or 128 bytes. Arrange values so
+ * that it is 128 on a first call and 8 on a second. This triggered a
+ * bug in verifier's widen_imprecise_scalars() logic.
+ */
+ asm volatile (
+ "r6 = 0;"
+ "r1 = 0;"
+ "1:"
+ "call iterator_with_diff_stack_depth;"
+ "r1 = 42;"
+ "r6 += 1;"
+ "if r6 < 2 goto 1b;"
+ "r0 = 0;"
+ "exit;"
+ ::: __clobber_all);
+}
diff --git a/tools/testing/selftests/bpf/progs/livepatch_trampoline.c b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c
new file mode 100644
index 000000000000..15579d5bcd91
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+int fentry_hit;
+int fexit_hit;
+int my_pid;
+
+SEC("fentry/cmdline_proc_show")
+int BPF_PROG(fentry_cmdline)
+{
+ if (my_pid != (bpf_get_current_pid_tgid() >> 32))
+ return 0;
+
+ fentry_hit = 1;
+ return 0;
+}
+
+SEC("fexit/cmdline_proc_show")
+int BPF_PROG(fexit_cmdline)
+{
+ if (my_pid != (bpf_get_current_pid_tgid() >> 32))
+ return 0;
+
+ fexit_hit = 1;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/mptcp_sockmap.c b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c
new file mode 100644
index 000000000000..d4eef0cbadb9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bpf_tracing_net.h"
+
+char _license[] SEC("license") = "GPL";
+
+int sk_index;
+int redirect_idx;
+int trace_port;
+int helper_ret;
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+ __uint(max_entries, 100);
+} sock_map SEC(".maps");
+
+SEC("sockops")
+int mptcp_sockmap_inject(struct bpf_sock_ops *skops)
+{
+ struct bpf_sock *sk;
+
+ /* only accept specified connection */
+ if (skops->local_port != trace_port ||
+ skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
+ return 1;
+
+ sk = skops->sk;
+ if (!sk)
+ return 1;
+
+ /* update sk handler */
+ helper_ret = bpf_sock_map_update(skops, &sock_map, &sk_index, BPF_NOEXIST);
+
+ return 1;
+}
+
+SEC("sk_skb/stream_verdict")
+int mptcp_sockmap_redirect(struct __sk_buff *skb)
+{
+ /* redirect skb to the sk under sock_map[redirect_idx] */
+ return bpf_sk_redirect_map(skb, &sock_map, redirect_idx, 0);
+}
diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c
new file mode 100644
index 000000000000..a96c8150d7f5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH 127
+#endif
+
+typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH];
+
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, 16384);
+ __type(key, __u32);
+ __type(value, stack_trace_t);
+} stackmap SEC(".maps");
+
+extern bool CONFIG_UNWINDER_ORC __kconfig __weak;
+
+/*
+ * This function is here to have CONFIG_UNWINDER_ORC
+ * used and added to object BTF.
+ */
+int unused(void)
+{
+ return CONFIG_UNWINDER_ORC ? 0 : 1;
+}
+
+__u32 stack_key;
+
+SEC("kprobe.multi")
+int kprobe_multi_test(struct pt_regs *ctx)
+{
+ stack_key = bpf_get_stackid(ctx, &stackmap, 0);
+ return 0;
+}
+
+SEC("raw_tp/bpf_testmod_test_read")
+int rawtp_test(void *ctx)
+{
+ /* Skip ebpf program entry in the stack. */
+ stack_key = bpf_get_stackid(ctx, &stackmap, 0);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c
index b4a0d0cc8ec8..3662515f0107 100644
--- a/tools/testing/selftests/bpf/progs/stream_fail.c
+++ b/tools/testing/selftests/bpf/progs/stream_fail.c
@@ -10,7 +10,7 @@ SEC("syscall")
__failure __msg("Possibly NULL pointer passed")
int stream_vprintk_null_arg(void *ctx)
{
- bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0, NULL);
+ bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL);
return 0;
}
@@ -18,7 +18,7 @@ SEC("syscall")
__failure __msg("R3 type=scalar expected=")
int stream_vprintk_scalar_arg(void *ctx)
{
- bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0, NULL);
+ bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL);
return 0;
}
@@ -26,7 +26,7 @@ SEC("syscall")
__failure __msg("arg#1 doesn't point to a const string")
int stream_vprintk_string_arg(void *ctx)
{
- bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0, NULL);
+ bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL);
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c
index 23217f06a3ec..663a80990f8f 100644
--- a/tools/testing/selftests/bpf/progs/task_work.c
+++ b/tools/testing/selftests/bpf/progs/task_work.c
@@ -66,7 +66,7 @@ int oncpu_hash_map(struct pt_regs *args)
if (!work)
return 0;
- bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL);
return 0;
}
@@ -80,7 +80,7 @@ int oncpu_array_map(struct pt_regs *args)
work = bpf_map_lookup_elem(&arrmap, &key);
if (!work)
return 0;
- bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL);
+ bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL);
return 0;
}
@@ -102,6 +102,6 @@ int oncpu_lru_map(struct pt_regs *args)
work = bpf_map_lookup_elem(&lrumap, &key);
if (!work || work->data[0])
return 0;
- bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL);
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c
index 77fe8f28facd..1270953fd092 100644
--- a/tools/testing/selftests/bpf/progs/task_work_fail.c
+++ b/tools/testing/selftests/bpf/progs/task_work_fail.c
@@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args)
work = bpf_map_lookup_elem(&arrmap, &key);
if (!work)
return 0;
- bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL);
return 0;
}
@@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args)
struct bpf_task_work tw;
task = bpf_get_current_task_btf();
- bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL);
return 0;
}
@@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args)
struct task_struct *task;
task = bpf_get_current_task_btf();
- bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL);
return 0;
}
@@ -91,6 +91,6 @@ int map_null(struct pt_regs *args)
work = bpf_map_lookup_elem(&arrmap, &key);
if (!work)
return 0;
- bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL);
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c
index 90fca06fff56..55e555f7f41b 100644
--- a/tools/testing/selftests/bpf/progs/task_work_stress.c
+++ b/tools/testing/selftests/bpf/progs/task_work_stress.c
@@ -51,8 +51,8 @@ int schedule_task_work(void *ctx)
if (!work)
return 0;
}
- err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap,
- process_work, NULL);
+ err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap,
+ process_work, NULL);
if (err)
__sync_fetch_and_add(&schedule_error, 1);
else
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 8074bc5f6f20..ed0a4721d8fd 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -417,6 +417,30 @@ noinline int bpf_testmod_fentry_test11(u64 a, void *b, short c, int d,
return a + (long)b + c + d + (long)e + f + g + h + i + j + k;
}
+noinline void bpf_testmod_stacktrace_test(void)
+{
+ /* used for stacktrace test as attach function */
+ asm volatile ("");
+}
+
+noinline void bpf_testmod_stacktrace_test_3(void)
+{
+ bpf_testmod_stacktrace_test();
+ asm volatile ("");
+}
+
+noinline void bpf_testmod_stacktrace_test_2(void)
+{
+ bpf_testmod_stacktrace_test_3();
+ asm volatile ("");
+}
+
+noinline void bpf_testmod_stacktrace_test_1(void)
+{
+ bpf_testmod_stacktrace_test_2();
+ asm volatile ("");
+}
+
int bpf_testmod_fentry_ok;
noinline ssize_t
@@ -497,6 +521,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
21, 22, 23, 24, 25, 26) != 231)
goto out;
+ bpf_testmod_stacktrace_test_1();
+
bpf_testmod_fentry_ok = 1;
out:
return -EIO; /* always fail */
diff --git a/tools/testing/selftests/cachestat/.gitignore b/tools/testing/selftests/cachestat/.gitignore
index d6c30b43a4bb..abbb13b6e96b 100644
--- a/tools/testing/selftests/cachestat/.gitignore
+++ b/tools/testing/selftests/cachestat/.gitignore
@@ -1,2 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
test_cachestat
+tmpshmcstat
diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c
index c952640f163b..ab838bcb9ec5 100644
--- a/tools/testing/selftests/cachestat/test_cachestat.c
+++ b/tools/testing/selftests/cachestat/test_cachestat.c
@@ -226,7 +226,7 @@ bool run_cachestat_test(enum file_type type)
int syscall_ret;
size_t compute_len = PS * 512;
struct cachestat_range cs_range = { PS, compute_len };
- char *filename = "tmpshmcstat";
+ char *filename = "tmpshmcstat", *map;
struct cachestat cs;
bool ret = true;
int fd;
@@ -257,7 +257,7 @@ bool run_cachestat_test(enum file_type type)
}
break;
case FILE_MMAP:
- char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE,
+ map = mmap(NULL, filesize, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index a360e2eb2eef..1d778c8b7764 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -923,8 +923,10 @@ struct corecg_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) {
if (setup_named_v1_root(root, sizeof(root), CG_NAMED_NAME))
ksft_exit_skip("cgroup v2 isn't mounted and could not setup named v1 hierarchy\n");
@@ -946,12 +948,11 @@ post_v2_setup:
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
cleanup_named_v1_root(root);
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c
index d54e2317efff..b1b30e82dd7c 100644
--- a/tools/testing/selftests/cgroup/test_cpu.c
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -796,8 +796,10 @@ struct cpucg_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -814,11 +816,10 @@ int main(int argc, char *argv[])
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c
index 4034d14ba69a..8086d2ea394f 100644
--- a/tools/testing/selftests/cgroup/test_cpuset.c
+++ b/tools/testing/selftests/cgroup/test_cpuset.c
@@ -247,8 +247,10 @@ struct cpuset_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -265,11 +267,10 @@ int main(int argc, char *argv[])
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c
index dfb763819581..465cdad2bfca 100644
--- a/tools/testing/selftests/cgroup/test_freezer.c
+++ b/tools/testing/selftests/cgroup/test_freezer.c
@@ -1488,8 +1488,10 @@ struct cgfreezer_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
@@ -1501,11 +1503,10 @@ int main(int argc, char *argv[])
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c
index 0e5bb6c7307a..ed590b150a17 100644
--- a/tools/testing/selftests/cgroup/test_kill.c
+++ b/tools/testing/selftests/cgroup/test_kill.c
@@ -274,8 +274,10 @@ struct cgkill_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
@@ -287,11 +289,10 @@ int main(int argc, char *argv[])
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
index 63b3c9aad399..d4c4a514ee43 100644
--- a/tools/testing/selftests/cgroup/test_kmem.c
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -421,8 +421,10 @@ struct kmem_test {
int main(int argc, char **argv)
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -446,11 +448,10 @@ int main(int argc, char **argv)
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index a680f773f2d5..b117325c0439 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -1650,8 +1650,10 @@ struct memcg_test {
int main(int argc, char **argv)
{
char root[PATH_MAX];
- int i, proc_status, ret = EXIT_SUCCESS;
+ int i, proc_status;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -1685,11 +1687,10 @@ int main(int argc, char **argv)
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
index e1f578ca2841..86a8930b47e3 100644
--- a/tools/testing/selftests/cgroup/test_zswap.c
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -597,8 +597,10 @@ static bool zswap_configured(void)
int main(int argc, char **argv)
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -625,11 +627,10 @@ int main(int argc, char **argv)
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/coredump/.gitignore b/tools/testing/selftests/coredump/.gitignore
new file mode 100644
index 000000000000..097f52db0be9
--- /dev/null
+++ b/tools/testing/selftests/coredump/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+stackdump_test
+coredump_socket_test
+coredump_socket_protocol_test
diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile
index 77b3665c73c7..dece1a31d561 100644
--- a/tools/testing/selftests/coredump/Makefile
+++ b/tools/testing/selftests/coredump/Makefile
@@ -1,7 +1,13 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
-TEST_GEN_PROGS := stackdump_test
+TEST_GEN_PROGS := stackdump_test \
+ coredump_socket_test \
+ coredump_socket_protocol_test
TEST_FILES := stackdump
include ../lib.mk
+
+$(OUTPUT)/stackdump_test: coredump_test_helpers.c
+$(OUTPUT)/coredump_socket_test: coredump_test_helpers.c
+$(OUTPUT)/coredump_socket_protocol_test: coredump_test_helpers.c
diff --git a/tools/testing/selftests/coredump/coredump_socket_protocol_test.c b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c
new file mode 100644
index 000000000000..d19b6717c53e
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c
@@ -0,0 +1,1568 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/stat.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "coredump_test.h"
+
+#define NUM_CRASHING_COREDUMPS 5
+
+FIXTURE_SETUP(coredump)
+{
+ FILE *file;
+ int ret;
+
+ self->pid_coredump_server = -ESRCH;
+ self->fd_tmpfs_detached = -1;
+ file = fopen("/proc/sys/kernel/core_pattern", "r");
+ ASSERT_NE(NULL, file);
+
+ ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file);
+ ASSERT_TRUE(ret || feof(file));
+ ASSERT_LT(ret, sizeof(self->original_core_pattern));
+
+ self->original_core_pattern[ret] = '\0';
+ self->fd_tmpfs_detached = create_detached_tmpfs();
+ ASSERT_GE(self->fd_tmpfs_detached, 0);
+
+ ret = fclose(file);
+ ASSERT_EQ(0, ret);
+}
+
+FIXTURE_TEARDOWN(coredump)
+{
+ const char *reason;
+ FILE *file;
+ int ret, status;
+
+ if (self->pid_coredump_server > 0) {
+ kill(self->pid_coredump_server, SIGTERM);
+ waitpid(self->pid_coredump_server, &status, 0);
+ }
+ unlink("/tmp/coredump.file");
+ unlink("/tmp/coredump.socket");
+
+ file = fopen("/proc/sys/kernel/core_pattern", "w");
+ if (!file) {
+ reason = "Unable to open core_pattern";
+ goto fail;
+ }
+
+ ret = fprintf(file, "%s", self->original_core_pattern);
+ if (ret < 0) {
+ reason = "Unable to write to core_pattern";
+ goto fail;
+ }
+
+ ret = fclose(file);
+ if (ret) {
+ reason = "Unable to close core_pattern";
+ goto fail;
+ }
+
+ if (self->fd_tmpfs_detached >= 0) {
+ ret = close(self->fd_tmpfs_detached);
+ if (ret < 0) {
+ reason = "Unable to close detached tmpfs";
+ goto fail;
+ }
+ self->fd_tmpfs_detached = -1;
+ }
+
+ return;
+fail:
+ /* This should never happen */
+ fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason);
+}
+
+TEST_F(coredump, socket_request_kernel)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct stat st;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_kernel: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_kernel: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_kernel: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_kernel: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_kernel: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_kernel: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_kernel: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ fd_core_file = creat("/tmp/coredump.file", 0644);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket_request_kernel: creat coredump file failed: %m\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_kernel: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_kernel: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_kernel: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_request_kernel: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_request_kernel: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ if (bytes_write < 0 && errno == ENOSPC)
+ continue;
+ fprintf(stderr, "socket_request_kernel: write to core file failed (read=%zd, write=%zd): %m\n",
+ bytes_read, bytes_write);
+ goto out;
+ }
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_kernel: completed successfully\n");
+out:
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+
+ ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
+ ASSERT_GT(st.st_size, 0);
+ system("file /tmp/coredump.file");
+}
+
+TEST_F(coredump, socket_request_userspace)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_userspace: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_userspace: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_userspace: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_userspace: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_userspace: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_userspace: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_userspace: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_userspace: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_userspace: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_userspace: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_request_userspace: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read > 0) {
+ fprintf(stderr, "socket_request_userspace: unexpected data received (expected no coredump data)\n");
+ goto out;
+ }
+
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_request_userspace: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_userspace: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_reject)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_reject: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_reject: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_reject: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_reject: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_reject: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_reject: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_reject: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_reject: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_reject: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_reject: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_request_reject: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read > 0) {
+ fprintf(stderr, "socket_request_reject: unexpected data received (expected no coredump data for REJECT)\n");
+ goto out;
+ }
+
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_request_reject: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_reject: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_invalid_flag_combination)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: read_marker COREDUMP_MARK_CONFLICTING failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_invalid_flag_combination: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_unknown_flag)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_unknown_flag: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_unknown_flag: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_unknown_flag: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_unknown_flag: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_unknown_flag: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_unknown_flag: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_unknown_flag: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_unknown_flag: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_unknown_flag: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) {
+ fprintf(stderr, "socket_request_unknown_flag: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) {
+ fprintf(stderr, "socket_request_unknown_flag: read_marker COREDUMP_MARK_UNSUPPORTED failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_unknown_flag: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_invalid_size_small)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_invalid_size_small: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_invalid_size_small: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_invalid_size_small: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_invalid_size_small: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_invalid_size_small: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_invalid_size_small: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_invalid_size_small: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_invalid_size_small: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_invalid_size_small: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT,
+ COREDUMP_ACK_SIZE_VER0 / 2)) {
+ fprintf(stderr, "socket_request_invalid_size_small: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) {
+ fprintf(stderr, "socket_request_invalid_size_small: read_marker COREDUMP_MARK_MINSIZE failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_invalid_size_small: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_invalid_size_large)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_invalid_size_large: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_invalid_size_large: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_invalid_size_large: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_invalid_size_large: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_invalid_size_large: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_invalid_size_large: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_invalid_size_large: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_invalid_size_large: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_invalid_size_large: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT,
+ COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) {
+ fprintf(stderr, "socket_request_invalid_size_large: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) {
+ fprintf(stderr, "socket_request_invalid_size_large: read_marker COREDUMP_MARK_MAXSIZE failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_invalid_size_large: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGSEGV
+ *
+ * Verify that when using socket-based coredump protocol,
+ * the coredump_signal field is correctly exposed as SIGSEGV.
+ */
+TEST_F(coredump, socket_coredump_signal_sigsegv)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ /* Verify coredump_signal is available and correct */
+ if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+ goto out;
+ }
+
+ if (info.coredump_signal != SIGSEGV) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n",
+ info.coredump_signal, SIGSEGV);
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGSEGV);
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+ ASSERT_EQ(info.coredump_signal, SIGSEGV);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGABRT
+ *
+ * Verify that when using socket-based coredump protocol,
+ * the coredump_signal field is correctly exposed as SIGABRT.
+ */
+TEST_F(coredump, socket_coredump_signal_sigabrt)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ /* Verify coredump_signal is available and correct */
+ if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+ goto out;
+ }
+
+ if (info.coredump_signal != SIGABRT) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n",
+ info.coredump_signal, SIGABRT);
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ abort();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGABRT);
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+ ASSERT_EQ(info.coredump_signal, SIGABRT);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500)
+{
+ int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
+ pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+ int exit_code = EXIT_FAILURE;
+ struct coredump_req req = {};
+
+ close(ipc_sockets[0]);
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "Failed to create and listen on unix socket\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "Failed to notify parent via ipc socket\n");
+ goto out;
+ }
+ close(ipc_sockets[1]);
+
+ for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump);
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd);
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd);
+ goto out;
+ }
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd);
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump);
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump);
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump);
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump);
+ goto out;
+ }
+
+ fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump);
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump);
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ if (bytes_write < 0 && errno == ENOSPC)
+ continue;
+ fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file);
+ goto out;
+ }
+ }
+
+ close(fd_core_file);
+ close(fd_peer_pidfd);
+ close(fd_coredump);
+ fd_peer_pidfd = -1;
+ fd_coredump = -1;
+ }
+
+ exit_code = EXIT_SUCCESS;
+out:
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+ pid[i] = fork();
+ ASSERT_GE(pid[i], 0);
+ if (pid[i] == 0)
+ crashing_child();
+ pidfd[i] = sys_pidfd_open(pid[i], 0);
+ ASSERT_GE(pidfd[i], 0);
+ }
+
+ for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+ waitpid(pid[i], &status[i], 0);
+ ASSERT_TRUE(WIFSIGNALED(status[i]));
+ ASSERT_TRUE(WCOREDUMP(status[i]));
+ }
+
+ for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+ info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+ ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+ }
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500)
+{
+ int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
+ pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS];
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0;
+ fd_server = -1;
+ exit_code = EXIT_FAILURE;
+ n_conns = 0;
+ close(ipc_sockets[0]);
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+ close(ipc_sockets[1]);
+
+ while (n_conns < NUM_CRASHING_COREDUMPS) {
+ int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+ struct coredump_req req = {};
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ continue;
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: accept4 failed: %m\n");
+ goto out;
+ }
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_peer_pidfd failed\n");
+ goto out;
+ }
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_pidfd_info failed\n");
+ goto out;
+ }
+ if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: missing PIDFD_INFO_COREDUMP or PIDFD_COREDUMPED\n");
+ goto out;
+ }
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_coredump_req failed\n");
+ goto out;
+ }
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: check_coredump_req failed\n");
+ goto out;
+ }
+ if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: send_coredump_ack failed\n");
+ goto out;
+ }
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_marker failed\n");
+ goto out;
+ }
+ fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: open_coredump_tmpfile failed: %m\n");
+ goto out;
+ }
+ pid_t worker = fork();
+ if (worker == 0) {
+ close(fd_server);
+ process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file);
+ }
+ worker_pids[n_conns] = worker;
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ n_conns++;
+ }
+ exit_code = EXIT_SUCCESS;
+out:
+ if (fd_server >= 0)
+ close(fd_server);
+
+ // Reap all worker processes
+ for (int i = 0; i < n_conns; i++) {
+ int wstatus;
+ if (waitpid(worker_pids[i], &wstatus, 0) < 0) {
+ fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]);
+ } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) {
+ fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus));
+ exit_code = EXIT_FAILURE;
+ }
+ }
+
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+ pid[i] = fork();
+ ASSERT_GE(pid[i], 0);
+ if (pid[i] == 0)
+ crashing_child();
+ pidfd[i] = sys_pidfd_open(pid[i], 0);
+ ASSERT_GE(pidfd[i], 0);
+ }
+
+ for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+ ASSERT_GE(waitpid(pid[i], &status[i], 0), 0);
+ ASSERT_TRUE(WIFSIGNALED(status[i]));
+ ASSERT_TRUE(WCOREDUMP(status[i]));
+ }
+
+ for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+ info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+ ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+ }
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/coredump/coredump_socket_test.c b/tools/testing/selftests/coredump/coredump_socket_test.c
new file mode 100644
index 000000000000..7e26d4a6a15d
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_socket_test.c
@@ -0,0 +1,742 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/stat.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "coredump_test.h"
+
+FIXTURE_SETUP(coredump)
+{
+ FILE *file;
+ int ret;
+
+ self->pid_coredump_server = -ESRCH;
+ self->fd_tmpfs_detached = -1;
+ file = fopen("/proc/sys/kernel/core_pattern", "r");
+ ASSERT_NE(NULL, file);
+
+ ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file);
+ ASSERT_TRUE(ret || feof(file));
+ ASSERT_LT(ret, sizeof(self->original_core_pattern));
+
+ self->original_core_pattern[ret] = '\0';
+ self->fd_tmpfs_detached = create_detached_tmpfs();
+ ASSERT_GE(self->fd_tmpfs_detached, 0);
+
+ ret = fclose(file);
+ ASSERT_EQ(0, ret);
+}
+
+FIXTURE_TEARDOWN(coredump)
+{
+ const char *reason;
+ FILE *file;
+ int ret, status;
+
+ if (self->pid_coredump_server > 0) {
+ kill(self->pid_coredump_server, SIGTERM);
+ waitpid(self->pid_coredump_server, &status, 0);
+ }
+ unlink("/tmp/coredump.file");
+ unlink("/tmp/coredump.socket");
+
+ file = fopen("/proc/sys/kernel/core_pattern", "w");
+ if (!file) {
+ reason = "Unable to open core_pattern";
+ goto fail;
+ }
+
+ ret = fprintf(file, "%s", self->original_core_pattern);
+ if (ret < 0) {
+ reason = "Unable to write to core_pattern";
+ goto fail;
+ }
+
+ ret = fclose(file);
+ if (ret) {
+ reason = "Unable to close core_pattern";
+ goto fail;
+ }
+
+ if (self->fd_tmpfs_detached >= 0) {
+ ret = close(self->fd_tmpfs_detached);
+ if (ret < 0) {
+ reason = "Unable to close detached tmpfs";
+ goto fail;
+ }
+ self->fd_tmpfs_detached = -1;
+ }
+
+ return;
+fail:
+ /* This should never happen */
+ fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason);
+}
+
+TEST_F(coredump, socket)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct stat st;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket test: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket test: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket test: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket test: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket test: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket test: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket test: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ fd_core_file = creat("/tmp/coredump.file", 0644);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket test: creat coredump file failed: %m\n");
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket test: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ if (bytes_write < 0 && errno == ENOSPC)
+ continue;
+ fprintf(stderr, "socket test: write to core file failed (read=%zd, write=%zd): %m\n", bytes_read, bytes_write);
+ goto out;
+ }
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket test: completed successfully\n");
+out:
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+
+ ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
+ ASSERT_GT(st.st_size, 0);
+}
+
+TEST_F(coredump, socket_detect_userspace_client)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct stat st;
+ struct pidfd_info info = {
+ .mask = PIDFD_INFO_COREDUMP,
+ };
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_detect_userspace_client: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_detect_userspace_client: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_detect_userspace_client: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_detect_userspace_client: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_detect_userspace_client: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_detect_userspace_client: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (info.coredump_mask & PIDFD_COREDUMPED) {
+ fprintf(stderr, "socket_detect_userspace_client: PIDFD_COREDUMPED incorrectly set (should be userspace client)\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_detect_userspace_client: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ int fd_socket;
+ ssize_t ret;
+ const struct sockaddr_un coredump_sk = {
+ .sun_family = AF_UNIX,
+ .sun_path = "/tmp/coredump.socket",
+ };
+ size_t coredump_sk_len =
+ offsetof(struct sockaddr_un, sun_path) +
+ sizeof("/tmp/coredump.socket");
+
+ fd_socket = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd_socket < 0) {
+ fprintf(stderr, "socket_detect_userspace_client (client): socket failed: %m\n");
+ _exit(EXIT_FAILURE);
+ }
+
+ ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+ if (ret < 0) {
+ fprintf(stderr, "socket_detect_userspace_client (client): connect failed: %m\n");
+ _exit(EXIT_FAILURE);
+ }
+
+ close(fd_socket);
+ pause();
+ fprintf(stderr, "socket_detect_userspace_client (client): completed successfully\n");
+ _exit(EXIT_SUCCESS);
+ }
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+
+ ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
+ ASSERT_EQ(close(pidfd), 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+ ASSERT_NE(stat("/tmp/coredump.file", &st), 0);
+ ASSERT_EQ(errno, ENOENT);
+}
+
+TEST_F(coredump, socket_enoent)
+{
+ int pidfd, status;
+ pid_t pid;
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+}
+
+TEST_F(coredump, socket_no_listener)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ int ipc_sockets[2];
+ char c;
+ const struct sockaddr_un coredump_sk = {
+ .sun_family = AF_UNIX,
+ .sun_path = "/tmp/coredump.socket",
+ };
+ size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) +
+ sizeof("/tmp/coredump.socket");
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_no_listener: socket failed: %m\n");
+ goto out;
+ }
+
+ ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+ if (ret < 0) {
+ fprintf(stderr, "socket_no_listener: bind failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_no_listener: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_no_listener: completed successfully\n");
+out:
+ if (fd_server >= 0)
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump
+ *
+ * Verify that when using simple socket-based coredump (@ pattern),
+ * the coredump_signal field is correctly exposed as SIGSEGV.
+ */
+TEST_F(coredump, socket_coredump_signal_sigsegv)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ /* Verify coredump_signal is available and correct */
+ if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+ goto out;
+ }
+
+ if (info.coredump_signal != SIGSEGV) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n",
+ info.coredump_signal, SIGSEGV);
+ goto out;
+ }
+
+ fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: open_coredump_tmpfile failed: %m\n");
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: write to core file failed (read=%zd, write=%zd): %m\n",
+ bytes_read, bytes_write);
+ goto out;
+ }
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n");
+out:
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGSEGV);
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+ ASSERT_EQ(info.coredump_signal, SIGSEGV);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump with SIGABRT
+ *
+ * Verify that when using simple socket-based coredump (@ pattern),
+ * the coredump_signal field is correctly exposed as SIGABRT.
+ */
+TEST_F(coredump, socket_coredump_signal_sigabrt)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ /* Verify coredump_signal is available and correct */
+ if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+ goto out;
+ }
+
+ if (info.coredump_signal != SIGABRT) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n",
+ info.coredump_signal, SIGABRT);
+ goto out;
+ }
+
+ fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: open_coredump_tmpfile failed: %m\n");
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: write to core file failed (read=%zd, write=%zd): %m\n",
+ bytes_read, bytes_write);
+ goto out;
+ }
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n");
+out:
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ abort();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGABRT);
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+ ASSERT_EQ(info.coredump_signal, SIGABRT);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_invalid_paths)
+{
+ ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/.."));
+ ASSERT_FALSE(set_core_pattern("@.."));
+
+ ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@@../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/.."));
+ ASSERT_FALSE(set_core_pattern("@@.."));
+
+ ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket"));
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/coredump/coredump_test.h b/tools/testing/selftests/coredump/coredump_test.h
new file mode 100644
index 000000000000..ed47f01fa53c
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_test.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __COREDUMP_TEST_H
+#define __COREDUMP_TEST_H
+
+#include <stdbool.h>
+#include <sys/types.h>
+#include <linux/coredump.h>
+
+#include "../kselftest_harness.h"
+#include "../pidfd/pidfd.h"
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+#define NUM_THREAD_SPAWN 128
+
+/* Coredump fixture */
+FIXTURE(coredump)
+{
+ char original_core_pattern[256];
+ pid_t pid_coredump_server;
+ int fd_tmpfs_detached;
+};
+
+/* Shared helper function declarations */
+void *do_nothing(void *arg);
+void crashing_child(void);
+int create_detached_tmpfs(void);
+int create_and_listen_unix_socket(const char *path);
+bool set_core_pattern(const char *pattern);
+int get_peer_pidfd(int fd);
+bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info);
+
+/* Inline helper that uses harness types */
+static inline void wait_and_check_coredump_server(pid_t pid_coredump_server,
+ struct __test_metadata *const _metadata,
+ FIXTURE_DATA(coredump) *self)
+{
+ int status;
+ waitpid(pid_coredump_server, &status, 0);
+ self->pid_coredump_server = -ESRCH;
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
+/* Protocol helper function declarations */
+ssize_t recv_marker(int fd);
+bool read_marker(int fd, enum coredump_mark mark);
+bool read_coredump_req(int fd, struct coredump_req *req);
+bool send_coredump_ack(int fd, const struct coredump_req *req,
+ __u64 mask, size_t size_ack);
+bool check_coredump_req(const struct coredump_req *req, size_t min_size,
+ __u64 required_mask);
+int open_coredump_tmpfile(int fd_tmpfs_detached);
+void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file);
+
+#endif /* __COREDUMP_TEST_H */
diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c
new file mode 100644
index 000000000000..a6f6d5f2ae07
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_test_helpers.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/coredump.h>
+#include <linux/fs.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../filesystems/wrappers.h"
+#include "../pidfd/pidfd.h"
+
+/* Forward declarations to avoid including harness header */
+struct __test_metadata;
+
+/* Match the fixture definition from coredump_test.h */
+struct _fixture_coredump_data {
+ char original_core_pattern[256];
+ pid_t pid_coredump_server;
+ int fd_tmpfs_detached;
+};
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+#define NUM_THREAD_SPAWN 128
+
+void *do_nothing(void *arg)
+{
+ (void)arg;
+ while (1)
+ pause();
+
+ return NULL;
+}
+
+void crashing_child(void)
+{
+ pthread_t thread;
+ int i;
+
+ for (i = 0; i < NUM_THREAD_SPAWN; ++i)
+ pthread_create(&thread, NULL, do_nothing, NULL);
+
+ /* crash on purpose */
+ i = *(int *)NULL;
+}
+
+int create_detached_tmpfs(void)
+{
+ int fd_context, fd_tmpfs;
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ if (fd_context < 0)
+ return -1;
+
+ if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
+ return -1;
+
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ close(fd_context);
+ return fd_tmpfs;
+}
+
+int create_and_listen_unix_socket(const char *path)
+{
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ assert(strlen(path) < sizeof(addr.sun_path) - 1);
+ strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
+ size_t addr_len =
+ offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1;
+ int fd, ret;
+
+ fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ if (fd < 0)
+ goto out;
+
+ ret = bind(fd, (const struct sockaddr *)&addr, addr_len);
+ if (ret < 0)
+ goto out;
+
+ ret = listen(fd, 128);
+ if (ret < 0)
+ goto out;
+
+ return fd;
+
+out:
+ if (fd >= 0)
+ close(fd);
+ return -1;
+}
+
+bool set_core_pattern(const char *pattern)
+{
+ int fd;
+ ssize_t ret;
+
+ fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC);
+ if (fd < 0)
+ return false;
+
+ ret = write(fd, pattern, strlen(pattern));
+ close(fd);
+ if (ret < 0)
+ return false;
+
+ fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern));
+ return ret == strlen(pattern);
+}
+
+int get_peer_pidfd(int fd)
+{
+ int fd_peer_pidfd;
+ socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd);
+ int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd,
+ &fd_peer_pidfd_len);
+ if (ret < 0) {
+ fprintf(stderr, "get_peer_pidfd: getsockopt(SO_PEERPIDFD) failed: %m\n");
+ return -1;
+ }
+ fprintf(stderr, "get_peer_pidfd: successfully retrieved pidfd %d\n", fd_peer_pidfd);
+ return fd_peer_pidfd;
+}
+
+bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info)
+{
+ int ret;
+ memset(info, 0, sizeof(*info));
+ info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
+ ret = ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info);
+ if (ret < 0) {
+ fprintf(stderr, "get_pidfd_info: ioctl(PIDFD_GET_INFO) failed: %m\n");
+ return false;
+ }
+ fprintf(stderr, "get_pidfd_info: mask=0x%llx, coredump_mask=0x%x, coredump_signal=%d\n",
+ (unsigned long long)info->mask, info->coredump_mask, info->coredump_signal);
+ return true;
+}
+
+/* Protocol helper functions */
+
+ssize_t recv_marker(int fd)
+{
+ enum coredump_mark mark = COREDUMP_MARK_REQACK;
+ ssize_t ret;
+
+ ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL);
+ if (ret != sizeof(mark))
+ return -1;
+
+ switch (mark) {
+ case COREDUMP_MARK_REQACK:
+ fprintf(stderr, "Received marker: ReqAck\n");
+ return COREDUMP_MARK_REQACK;
+ case COREDUMP_MARK_MINSIZE:
+ fprintf(stderr, "Received marker: MinSize\n");
+ return COREDUMP_MARK_MINSIZE;
+ case COREDUMP_MARK_MAXSIZE:
+ fprintf(stderr, "Received marker: MaxSize\n");
+ return COREDUMP_MARK_MAXSIZE;
+ case COREDUMP_MARK_UNSUPPORTED:
+ fprintf(stderr, "Received marker: Unsupported\n");
+ return COREDUMP_MARK_UNSUPPORTED;
+ case COREDUMP_MARK_CONFLICTING:
+ fprintf(stderr, "Received marker: Conflicting\n");
+ return COREDUMP_MARK_CONFLICTING;
+ default:
+ fprintf(stderr, "Received unknown marker: %u\n", mark);
+ break;
+ }
+ return -1;
+}
+
+bool read_marker(int fd, enum coredump_mark mark)
+{
+ ssize_t ret;
+
+ ret = recv_marker(fd);
+ if (ret < 0)
+ return false;
+ return ret == mark;
+}
+
+bool read_coredump_req(int fd, struct coredump_req *req)
+{
+ ssize_t ret;
+ size_t field_size, user_size, ack_size, kernel_size, remaining_size;
+
+ memset(req, 0, sizeof(*req));
+ field_size = sizeof(req->size);
+
+ /* Peek the size of the coredump request. */
+ ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL);
+ if (ret != field_size) {
+ fprintf(stderr, "read_coredump_req: peek failed (got %zd, expected %zu): %m\n",
+ ret, field_size);
+ return false;
+ }
+ kernel_size = req->size;
+
+ if (kernel_size < COREDUMP_ACK_SIZE_VER0) {
+ fprintf(stderr, "read_coredump_req: kernel_size %zu < min %d\n",
+ kernel_size, COREDUMP_ACK_SIZE_VER0);
+ return false;
+ }
+ if (kernel_size >= PAGE_SIZE) {
+ fprintf(stderr, "read_coredump_req: kernel_size %zu >= PAGE_SIZE %d\n",
+ kernel_size, PAGE_SIZE);
+ return false;
+ }
+
+ /* Use the minimum of user and kernel size to read the full request. */
+ user_size = sizeof(struct coredump_req);
+ ack_size = user_size < kernel_size ? user_size : kernel_size;
+ ret = recv(fd, req, ack_size, MSG_WAITALL);
+ if (ret != ack_size)
+ return false;
+
+ fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n",
+ req->size, (unsigned long long)req->mask);
+
+ if (user_size > kernel_size)
+ remaining_size = user_size - kernel_size;
+ else
+ remaining_size = kernel_size - user_size;
+
+ if (PAGE_SIZE <= remaining_size)
+ return false;
+
+ /*
+ * Discard any additional data if the kernel's request was larger than
+ * what we knew about or cared about.
+ */
+ if (remaining_size) {
+ char buffer[PAGE_SIZE];
+
+ ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL);
+ if (ret != remaining_size)
+ return false;
+ fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size);
+ }
+
+ return true;
+}
+
+bool send_coredump_ack(int fd, const struct coredump_req *req,
+ __u64 mask, size_t size_ack)
+{
+ ssize_t ret;
+ /*
+ * Wrap struct coredump_ack in a larger struct so we can
+ * simulate sending to much data to the kernel.
+ */
+ struct large_ack_for_size_testing {
+ struct coredump_ack ack;
+ char buffer[PAGE_SIZE];
+ } large_ack = {};
+
+ if (!size_ack)
+ size_ack = sizeof(struct coredump_ack) < req->size_ack ?
+ sizeof(struct coredump_ack) :
+ req->size_ack;
+ large_ack.ack.mask = mask;
+ large_ack.ack.size = size_ack;
+ ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL);
+ if (ret != size_ack)
+ return false;
+
+ fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n",
+ size_ack, (unsigned long long)mask);
+ return true;
+}
+
+bool check_coredump_req(const struct coredump_req *req, size_t min_size,
+ __u64 required_mask)
+{
+ if (req->size < min_size)
+ return false;
+ if ((req->mask & required_mask) != required_mask)
+ return false;
+ if (req->mask & ~required_mask)
+ return false;
+ return true;
+}
+
+int open_coredump_tmpfile(int fd_tmpfs_detached)
+{
+ return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600);
+}
+
+void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file)
+{
+ int epfd = -1;
+ int exit_code = EXIT_FAILURE;
+ struct epoll_event ev;
+ int flags;
+
+ /* Set socket to non-blocking mode for edge-triggered epoll */
+ flags = fcntl(fd_coredump, F_GETFL, 0);
+ if (flags < 0) {
+ fprintf(stderr, "Worker: fcntl(F_GETFL) failed: %m\n");
+ goto out;
+ }
+ if (fcntl(fd_coredump, F_SETFL, flags | O_NONBLOCK) < 0) {
+ fprintf(stderr, "Worker: fcntl(F_SETFL, O_NONBLOCK) failed: %m\n");
+ goto out;
+ }
+
+ epfd = epoll_create1(0);
+ if (epfd < 0) {
+ fprintf(stderr, "Worker: epoll_create1() failed: %m\n");
+ goto out;
+ }
+
+ ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
+ ev.data.fd = fd_coredump;
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) {
+ fprintf(stderr, "Worker: epoll_ctl(EPOLL_CTL_ADD) failed: %m\n");
+ goto out;
+ }
+
+ for (;;) {
+ struct epoll_event events[1];
+ int n = epoll_wait(epfd, events, 1, -1);
+ if (n < 0) {
+ fprintf(stderr, "Worker: epoll_wait() failed: %m\n");
+ break;
+ }
+
+ if (events[0].events & (EPOLLIN | EPOLLRDHUP)) {
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ break;
+ fprintf(stderr, "Worker: read() failed: %m\n");
+ goto out;
+ }
+ if (bytes_read == 0)
+ goto done;
+ ssize_t bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_write != bytes_read) {
+ if (bytes_write < 0 && errno == ENOSPC)
+ continue;
+ fprintf(stderr, "Worker: write() failed (read=%zd, write=%zd): %m\n",
+ bytes_read, bytes_write);
+ goto out;
+ }
+ }
+ }
+ }
+
+done:
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "Worker: completed successfully\n");
+out:
+ if (epfd >= 0)
+ close(epfd);
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ _exit(exit_code);
+}
diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c
index a4ac80bb1003..c2e895bcc160 100644
--- a/tools/testing/selftests/coredump/stackdump_test.c
+++ b/tools/testing/selftests/coredump/stackdump_test.c
@@ -23,57 +23,15 @@
#include "../filesystems/wrappers.h"
#include "../pidfd/pidfd.h"
+#include "coredump_test.h"
+
#define STACKDUMP_FILE "stack_values"
#define STACKDUMP_SCRIPT "stackdump"
-#define NUM_THREAD_SPAWN 128
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
-static void *do_nothing(void *)
-{
- while (1)
- pause();
-
- return NULL;
-}
-
-static void crashing_child(void)
-{
- pthread_t thread;
- int i;
-
- for (i = 0; i < NUM_THREAD_SPAWN; ++i)
- pthread_create(&thread, NULL, do_nothing, NULL);
-
- /* crash on purpose */
- i = *(int *)NULL;
-}
-
-FIXTURE(coredump)
-{
- char original_core_pattern[256];
- pid_t pid_coredump_server;
- int fd_tmpfs_detached;
-};
-
-static int create_detached_tmpfs(void)
-{
- int fd_context, fd_tmpfs;
-
- fd_context = sys_fsopen("tmpfs", 0);
- if (fd_context < 0)
- return -1;
-
- if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
- return -1;
-
- fd_tmpfs = sys_fsmount(fd_context, 0, 0);
- close(fd_context);
- return fd_tmpfs;
-}
-
FIXTURE_SETUP(coredump)
{
FILE *file;
@@ -208,1620 +166,4 @@ TEST_F_TIMEOUT(coredump, stackdump, 120)
fclose(file);
}
-static int create_and_listen_unix_socket(const char *path)
-{
- struct sockaddr_un addr = {
- .sun_family = AF_UNIX,
- };
- assert(strlen(path) < sizeof(addr.sun_path) - 1);
- strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
- size_t addr_len =
- offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1;
- int fd, ret;
-
- fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
- if (fd < 0)
- goto out;
-
- ret = bind(fd, (const struct sockaddr *)&addr, addr_len);
- if (ret < 0)
- goto out;
-
- ret = listen(fd, 128);
- if (ret < 0)
- goto out;
-
- return fd;
-
-out:
- if (fd >= 0)
- close(fd);
- return -1;
-}
-
-static bool set_core_pattern(const char *pattern)
-{
- int fd;
- ssize_t ret;
-
- fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC);
- if (fd < 0)
- return false;
-
- ret = write(fd, pattern, strlen(pattern));
- close(fd);
- if (ret < 0)
- return false;
-
- fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern));
- return ret == strlen(pattern);
-}
-
-static int get_peer_pidfd(int fd)
-{
- int fd_peer_pidfd;
- socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd);
- int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd,
- &fd_peer_pidfd_len);
- if (ret < 0) {
- fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n");
- return -1;
- }
- return fd_peer_pidfd;
-}
-
-static bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info)
-{
- memset(info, 0, sizeof(*info));
- info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
- return ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) == 0;
-}
-
-static void
-wait_and_check_coredump_server(pid_t pid_coredump_server,
- struct __test_metadata *const _metadata,
- FIXTURE_DATA(coredump)* self)
-{
- int status;
- waitpid(pid_coredump_server, &status, 0);
- self->pid_coredump_server = -ESRCH;
- ASSERT_TRUE(WIFEXITED(status));
- ASSERT_EQ(WEXITSTATUS(status), 0);
-}
-
-TEST_F(coredump, socket)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct stat st;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- fd_core_file = creat("/tmp/coredump.file", 0644);
- if (fd_core_file < 0)
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read, bytes_write;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
-
- bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_read != bytes_write)
- goto out;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_TRUE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-
- ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
- ASSERT_GT(st.st_size, 0);
- system("file /tmp/coredump.file");
-}
-
-TEST_F(coredump, socket_detect_userspace_client)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct stat st;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (info.coredump_mask & PIDFD_COREDUMPED)
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0) {
- int fd_socket;
- ssize_t ret;
- const struct sockaddr_un coredump_sk = {
- .sun_family = AF_UNIX,
- .sun_path = "/tmp/coredump.socket",
- };
- size_t coredump_sk_len =
- offsetof(struct sockaddr_un, sun_path) +
- sizeof("/tmp/coredump.socket");
-
- fd_socket = socket(AF_UNIX, SOCK_STREAM, 0);
- if (fd_socket < 0)
- _exit(EXIT_FAILURE);
-
- ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
- if (ret < 0)
- _exit(EXIT_FAILURE);
-
- close(fd_socket);
- _exit(EXIT_SUCCESS);
- }
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFEXITED(status));
- ASSERT_EQ(WEXITSTATUS(status), 0);
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-
- ASSERT_NE(stat("/tmp/coredump.file", &st), 0);
- ASSERT_EQ(errno, ENOENT);
-}
-
-TEST_F(coredump, socket_enoent)
-{
- int pidfd, status;
- pid_t pid;
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-}
-
-TEST_F(coredump, socket_no_listener)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- int ipc_sockets[2];
- char c;
- const struct sockaddr_un coredump_sk = {
- .sun_family = AF_UNIX,
- .sun_path = "/tmp/coredump.socket",
- };
- size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) +
- sizeof("/tmp/coredump.socket");
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
- if (fd_server < 0)
- goto out;
-
- ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
- if (ret < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_server >= 0)
- close(fd_server);
- close(ipc_sockets[1]);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-static ssize_t recv_marker(int fd)
-{
- enum coredump_mark mark = COREDUMP_MARK_REQACK;
- ssize_t ret;
-
- ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL);
- if (ret != sizeof(mark))
- return -1;
-
- switch (mark) {
- case COREDUMP_MARK_REQACK:
- fprintf(stderr, "Received marker: ReqAck\n");
- return COREDUMP_MARK_REQACK;
- case COREDUMP_MARK_MINSIZE:
- fprintf(stderr, "Received marker: MinSize\n");
- return COREDUMP_MARK_MINSIZE;
- case COREDUMP_MARK_MAXSIZE:
- fprintf(stderr, "Received marker: MaxSize\n");
- return COREDUMP_MARK_MAXSIZE;
- case COREDUMP_MARK_UNSUPPORTED:
- fprintf(stderr, "Received marker: Unsupported\n");
- return COREDUMP_MARK_UNSUPPORTED;
- case COREDUMP_MARK_CONFLICTING:
- fprintf(stderr, "Received marker: Conflicting\n");
- return COREDUMP_MARK_CONFLICTING;
- default:
- fprintf(stderr, "Received unknown marker: %u\n", mark);
- break;
- }
- return -1;
-}
-
-static bool read_marker(int fd, enum coredump_mark mark)
-{
- ssize_t ret;
-
- ret = recv_marker(fd);
- if (ret < 0)
- return false;
- return ret == mark;
-}
-
-static bool read_coredump_req(int fd, struct coredump_req *req)
-{
- ssize_t ret;
- size_t field_size, user_size, ack_size, kernel_size, remaining_size;
-
- memset(req, 0, sizeof(*req));
- field_size = sizeof(req->size);
-
- /* Peek the size of the coredump request. */
- ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL);
- if (ret != field_size)
- return false;
- kernel_size = req->size;
-
- if (kernel_size < COREDUMP_ACK_SIZE_VER0)
- return false;
- if (kernel_size >= PAGE_SIZE)
- return false;
-
- /* Use the minimum of user and kernel size to read the full request. */
- user_size = sizeof(struct coredump_req);
- ack_size = user_size < kernel_size ? user_size : kernel_size;
- ret = recv(fd, req, ack_size, MSG_WAITALL);
- if (ret != ack_size)
- return false;
-
- fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n",
- req->size, (unsigned long long)req->mask);
-
- if (user_size > kernel_size)
- remaining_size = user_size - kernel_size;
- else
- remaining_size = kernel_size - user_size;
-
- if (PAGE_SIZE <= remaining_size)
- return false;
-
- /*
- * Discard any additional data if the kernel's request was larger than
- * what we knew about or cared about.
- */
- if (remaining_size) {
- char buffer[PAGE_SIZE];
-
- ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL);
- if (ret != remaining_size)
- return false;
- fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size);
- }
-
- return true;
-}
-
-static bool send_coredump_ack(int fd, const struct coredump_req *req,
- __u64 mask, size_t size_ack)
-{
- ssize_t ret;
- /*
- * Wrap struct coredump_ack in a larger struct so we can
- * simulate sending to much data to the kernel.
- */
- struct large_ack_for_size_testing {
- struct coredump_ack ack;
- char buffer[PAGE_SIZE];
- } large_ack = {};
-
- if (!size_ack)
- size_ack = sizeof(struct coredump_ack) < req->size_ack ?
- sizeof(struct coredump_ack) :
- req->size_ack;
- large_ack.ack.mask = mask;
- large_ack.ack.size = size_ack;
- ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL);
- if (ret != size_ack)
- return false;
-
- fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n",
- size_ack, (unsigned long long)mask);
- return true;
-}
-
-static bool check_coredump_req(const struct coredump_req *req, size_t min_size,
- __u64 required_mask)
-{
- if (req->size < min_size)
- return false;
- if ((req->mask & required_mask) != required_mask)
- return false;
- if (req->mask & ~required_mask)
- return false;
- return true;
-}
-
-TEST_F(coredump, socket_request_kernel)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct stat st;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- fd_core_file = creat("/tmp/coredump.file", 0644);
- if (fd_core_file < 0)
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_KERNEL | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read, bytes_write;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
-
- bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_read != bytes_write)
- goto out;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_TRUE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-
- ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
- ASSERT_GT(st.st_size, 0);
- system("file /tmp/coredump.file");
-}
-
-TEST_F(coredump, socket_request_userspace)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_USERSPACE | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read > 0)
- goto out;
-
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_TRUE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_reject)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_REJECT | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read > 0)
- goto out;
-
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_invalid_flag_combination)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_unknown_flag)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_invalid_size_small)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_REJECT | COREDUMP_WAIT,
- COREDUMP_ACK_SIZE_VER0 / 2))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_invalid_size_large)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_REJECT | COREDUMP_WAIT,
- COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-static int open_coredump_tmpfile(int fd_tmpfs_detached)
-{
- return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600);
-}
-
-#define NUM_CRASHING_COREDUMPS 5
-
-TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500)
-{
- int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
- pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
- int exit_code = EXIT_FAILURE;
- struct coredump_req req = {};
-
- close(ipc_sockets[0]);
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0) {
- fprintf(stderr, "Failed to create and listen on unix socket\n");
- goto out;
- }
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
- fprintf(stderr, "Failed to notify parent via ipc socket\n");
- goto out;
- }
- close(ipc_sockets[1]);
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0) {
- fprintf(stderr, "accept4 failed: %m\n");
- goto out;
- }
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0) {
- fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump);
- goto out;
- }
-
- if (!get_pidfd_info(fd_peer_pidfd, &info)) {
- fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd);
- goto out;
- }
-
- if (!(info.mask & PIDFD_INFO_COREDUMP)) {
- fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd);
- goto out;
- }
- if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
- fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd);
- goto out;
- }
-
- if (!read_coredump_req(fd_coredump, &req)) {
- fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT)) {
- fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
- fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
- fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
- if (fd_core_file < 0) {
- fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read, bytes_write;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0) {
- fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump);
- goto out;
- }
-
- if (bytes_read == 0)
- break;
-
- bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_read != bytes_write) {
- fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file);
- goto out;
- }
- }
-
- close(fd_core_file);
- close(fd_peer_pidfd);
- close(fd_coredump);
- fd_peer_pidfd = -1;
- fd_coredump = -1;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- pid[i] = fork();
- ASSERT_GE(pid[i], 0);
- if (pid[i] == 0)
- crashing_child();
- pidfd[i] = sys_pidfd_open(pid[i], 0);
- ASSERT_GE(pidfd[i], 0);
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- waitpid(pid[i], &status[i], 0);
- ASSERT_TRUE(WIFSIGNALED(status[i]));
- ASSERT_TRUE(WCOREDUMP(status[i]));
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
- ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
- }
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-#define MAX_EVENTS 128
-
-static void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file)
-{
- int epfd = -1;
- int exit_code = EXIT_FAILURE;
-
- epfd = epoll_create1(0);
- if (epfd < 0)
- goto out;
-
- struct epoll_event ev;
- ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
- ev.data.fd = fd_coredump;
- if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0)
- goto out;
-
- for (;;) {
- struct epoll_event events[1];
- int n = epoll_wait(epfd, events, 1, -1);
- if (n < 0)
- break;
-
- if (events[0].events & (EPOLLIN | EPOLLRDHUP)) {
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0) {
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- break;
- goto out;
- }
- if (bytes_read == 0)
- goto done;
- ssize_t bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_write != bytes_read)
- goto out;
- }
- }
- }
-
-done:
- exit_code = EXIT_SUCCESS;
-out:
- if (epfd >= 0)
- close(epfd);
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- _exit(exit_code);
-}
-
-TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500)
-{
- int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
- pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS];
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
- ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0;
- fd_server = -1;
- exit_code = EXIT_FAILURE;
- n_conns = 0;
- close(ipc_sockets[0]);
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
- close(ipc_sockets[1]);
-
- while (n_conns < NUM_CRASHING_COREDUMPS) {
- int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
- struct coredump_req req = {};
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0) {
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- continue;
- goto out;
- }
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
- if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
- if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0))
- goto out;
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
- fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
- if (fd_core_file < 0)
- goto out;
- pid_t worker = fork();
- if (worker == 0) {
- close(fd_server);
- process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file);
- }
- worker_pids[n_conns] = worker;
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_core_file >= 0)
- close(fd_core_file);
- n_conns++;
- }
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_server >= 0)
- close(fd_server);
-
- // Reap all worker processes
- for (int i = 0; i < n_conns; i++) {
- int wstatus;
- if (waitpid(worker_pids[i], &wstatus, 0) < 0) {
- fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]);
- } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) {
- fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus));
- exit_code = EXIT_FAILURE;
- }
- }
-
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- pid[i] = fork();
- ASSERT_GE(pid[i], 0);
- if (pid[i] == 0)
- crashing_child();
- pidfd[i] = sys_pidfd_open(pid[i], 0);
- ASSERT_GE(pidfd[i], 0);
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- ASSERT_GE(waitpid(pid[i], &status[i], 0), 0);
- ASSERT_TRUE(WIFSIGNALED(status[i]));
- ASSERT_TRUE(WCOREDUMP(status[i]));
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
- ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
- }
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_invalid_paths)
-{
- ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/.."));
- ASSERT_FALSE(set_core_pattern("@.."));
-
- ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@@../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/.."));
- ASSERT_FALSE(set_core_pattern("@@.."));
-
- ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket"));
-}
-
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c
index b12f1f9babf8..b925756373ce 100644
--- a/tools/testing/selftests/dma/dma_map_benchmark.c
+++ b/tools/testing/selftests/dma/dma_map_benchmark.c
@@ -118,7 +118,7 @@ int main(int argc, char **argv)
}
printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n",
- threads, seconds, node, dir[directions], granule);
+ threads, seconds, node, directions[dir], granule);
printf("average map latency(us):%.1f standard deviation:%.1f\n",
map.avg_map_100ns/10.0, map.map_stddev/10.0);
printf("average unmap latency(us):%.1f standard deviation:%.1f\n",
diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile
index 6e41635bd55a..71ee69e524d7 100644
--- a/tools/testing/selftests/drivers/net/Makefile
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -18,6 +18,7 @@ TEST_PROGS := \
netcons_fragmented_msg.sh \
netcons_overflow.sh \
netcons_sysdata.sh \
+ netcons_torture.sh \
netpoll_basic.py \
ping.py \
psp.py \
diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
index 402d4ee84f2e..6c5c60adb5e8 100644
--- a/tools/testing/selftests/drivers/net/bonding/Makefile
+++ b/tools/testing/selftests/drivers/net/bonding/Makefile
@@ -14,6 +14,7 @@ TEST_PROGS := \
dev_addr_lists.sh \
mode-1-recovery-updelay.sh \
mode-2-recovery-updelay.sh \
+ netcons_over_bonding.sh \
# end of TEST_PROGS
TEST_FILES := \
@@ -24,6 +25,7 @@ TEST_FILES := \
TEST_INCLUDES := \
../../../net/lib.sh \
+ ../lib/sh/lib_netcons.sh \
../../../net/forwarding/lib.sh \
# end of TEST_INCLUDES
diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config
index 6bb290abd48b..991494376223 100644
--- a/tools/testing/selftests/drivers/net/bonding/config
+++ b/tools/testing/selftests/drivers/net/bonding/config
@@ -1,5 +1,6 @@
CONFIG_BONDING=y
CONFIG_BRIDGE=y
+CONFIG_CONFIGFS_FS=y
CONFIG_DUMMY=y
CONFIG_INET_ESP=y
CONFIG_INET_ESP_OFFLOAD=y
@@ -9,6 +10,9 @@ CONFIG_MACVLAN=y
CONFIG_NET_ACT_GACT=y
CONFIG_NET_CLS_FLOWER=y
CONFIG_NET_CLS_MATCHALL=m
+CONFIG_NETCONSOLE=m
+CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_NETCONSOLE_EXTENDED_LOG=y
CONFIG_NETDEVSIM=m
CONFIG_NET_SCH_INGRESS=y
CONFIG_NLMON=y
diff --git a/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh
new file mode 100755
index 000000000000..477cc9379500
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh
@@ -0,0 +1,361 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This selftest exercises trying to have multiple netpoll users at the same
+# time.
+#
+# This selftest has multiple smalls test inside, and the goal is to
+# get interfaces with bonding and netconsole in different orders in order
+# to catch any possible issue.
+#
+# The main test composes of four interfaces being created using netdevsim; two
+# of them are bonded to serve as the netconsole's transmit interface. The
+# remaining two interfaces are similarly bonded and assigned to a separate
+# network namespace, which acts as the receive interface, where socat monitors
+# for incoming messages.
+#
+# A netconsole message is then sent to ensure it is properly received across
+# this configuration.
+#
+# Later, run a few other tests, to make sure that bonding and netconsole
+# cannot coexist.
+#
+# The test's objective is to exercise netpoll usage when managed simultaneously
+# by multiple subsystems (netconsole and bonding).
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+modprobe bonding 2> /dev/null || true
+modprobe veth 2> /dev/null || true
+
+# The content of kmsg will be save to the following file
+OUTPUT_FILE="/tmp/${TARGET}"
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+echo "6 5" > /proc/sys/kernel/printk
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup_bond EXIT
+
+FORMAT="extended"
+IP_VERSION="ipv4"
+VETH0="veth"$(( RANDOM % 256))
+VETH1="veth"$((256 + RANDOM % 256))
+TXNS=""
+RXNS=""
+
+# Create "bond_tx_XX" and "bond_rx_XX" interfaces, and set DSTIF and SRCIF with
+# the bonding interfaces
+function setup_bonding_ifaces() {
+ local RAND=$(( RANDOM % 100 ))
+ BOND_TX_MAIN_IF="bond_tx_$RAND"
+ BOND_RX_MAIN_IF="bond_rx_$RAND"
+
+ # Setup TX
+ if ! ip -n "${TXNS}" link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr
+ then
+ echo "Failed to create bond TX interface. Is CONFIG_BONDING set?" >&2
+ # only clean nsim ifaces and namespace. Nothing else has been
+ # initialized
+ cleanup_bond_nsim
+ trap - EXIT
+ exit "${ksft_skip}"
+ fi
+
+ # create_netdevsim() got the interface up, but it needs to be down
+ # before being enslaved.
+ ip -n "${TXNS}" \
+ link set "${BOND_TX1_SLAVE_IF}" down
+ ip -n "${TXNS}" \
+ link set "${BOND_TX2_SLAVE_IF}" down
+ ip -n "${TXNS}" \
+ link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}"
+ ip -n "${TXNS}" \
+ link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}"
+ ip -n "${TXNS}" \
+ link set "${BOND_TX_MAIN_IF}" up
+
+ # Setup RX
+ ip -n "${RXNS}" \
+ link add "${BOND_RX_MAIN_IF}" type bond mode balance-rr
+ ip -n "${RXNS}" \
+ link set "${BOND_RX1_SLAVE_IF}" down
+ ip -n "${RXNS}" \
+ link set "${BOND_RX2_SLAVE_IF}" down
+ ip -n "${RXNS}" \
+ link set "${BOND_RX1_SLAVE_IF}" master "${BOND_RX_MAIN_IF}"
+ ip -n "${RXNS}" \
+ link set "${BOND_RX2_SLAVE_IF}" master "${BOND_RX_MAIN_IF}"
+ ip -n "${RXNS}" \
+ link set "${BOND_RX_MAIN_IF}" up
+
+ export DSTIF="${BOND_RX_MAIN_IF}"
+ export SRCIF="${BOND_TX_MAIN_IF}"
+}
+
+# Create 4 netdevsim interfaces. Two of them will be bound to TX bonding iface
+# and the other two will be bond to the RX interface (on the other namespace)
+function create_ifaces_bond() {
+ BOND_TX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_1}" "${TXNS}")
+ BOND_TX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_2}" "${TXNS}")
+ BOND_RX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_1}" "${RXNS}")
+ BOND_RX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_2}" "${RXNS}")
+}
+
+# netdevsim link BOND_TX to BOND_RX interfaces
+function link_ifaces_bond() {
+ local BOND_TX1_SLAVE_IFIDX
+ local BOND_TX2_SLAVE_IFIDX
+ local BOND_RX1_SLAVE_IFIDX
+ local BOND_RX2_SLAVE_IFIDX
+ local TXNS_FD
+ local RXNS_FD
+
+ BOND_TX1_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \
+ cat /sys/class/net/"$BOND_TX1_SLAVE_IF"/ifindex)
+ BOND_TX2_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \
+ cat /sys/class/net/"$BOND_TX2_SLAVE_IF"/ifindex)
+ BOND_RX1_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \
+ cat /sys/class/net/"$BOND_RX1_SLAVE_IF"/ifindex)
+ BOND_RX2_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \
+ cat /sys/class/net/"$BOND_RX2_SLAVE_IF"/ifindex)
+
+ exec {TXNS_FD}</var/run/netns/"${TXNS}"
+ exec {RXNS_FD}</var/run/netns/"${RXNS}"
+
+ # Linking TX ifaces to the RX ones (on the other namespace)
+ echo "${TXNS_FD}:$BOND_TX1_SLAVE_IFIDX $RXNS_FD:$BOND_RX1_SLAVE_IFIDX" \
+ > "$NSIM_DEV_SYS_LINK"
+ echo "${TXNS_FD}:$BOND_TX2_SLAVE_IFIDX $RXNS_FD:$BOND_RX2_SLAVE_IFIDX" \
+ > "$NSIM_DEV_SYS_LINK"
+
+ exec {TXNS_FD}<&-
+ exec {RXNS_FD}<&-
+}
+
+function create_all_ifaces() {
+ # setup_ns function is coming from lib.sh
+ setup_ns TXNS RXNS
+ export NAMESPACE="${RXNS}"
+
+ # Create two interfaces for RX and two for TX
+ create_ifaces_bond
+ # Link netlink ifaces
+ link_ifaces_bond
+}
+
+# configure DSTIF and SRCIF IPs
+function configure_ifaces_ips() {
+ local IP_VERSION=${1:-"ipv4"}
+ select_ipv4_or_ipv6 "${IP_VERSION}"
+
+ ip -n "${RXNS}" addr add "${DSTIP}"/24 dev "${DSTIF}"
+ ip -n "${RXNS}" link set "${DSTIF}" up
+
+ ip -n "${TXNS}" addr add "${SRCIP}"/24 dev "${SRCIF}"
+ ip -n "${TXNS}" link set "${SRCIF}" up
+}
+
+function test_enable_netpoll_on_enslaved_iface() {
+ echo 0 > "${NETCONS_PATH}"/enabled
+
+ # At this stage, BOND_TX1_SLAVE_IF is enslaved to BOND_TX_MAIN_IF, and
+ # linked to BOND_RX1_SLAVE_IF inside the namespace.
+ echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name
+
+ # This should fail with the following message in dmesg:
+ # netpoll: netconsole: ethX is a slave device, aborting
+ set +e
+ enable_netcons_ns 2> /dev/null
+ set -e
+
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]]
+ then
+ echo "test failed: Bonding and netpoll cannot co-exists." >&2
+ exit "${ksft_fail}"
+ fi
+}
+
+function test_delete_bond_and_reenable_target() {
+ ip -n "${TXNS}" \
+ link delete "${BOND_TX_MAIN_IF}" type bond
+
+ # BOND_TX1_SLAVE_IF is not attached to a bond interface anymore
+ # netpoll can be plugged in there
+ echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name
+
+ # this should work, since the interface is not enslaved
+ enable_netcons_ns
+
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]]
+ then
+ echo "test failed: Unable to start netpoll on an unbond iface." >&2
+ exit "${ksft_fail}"
+ fi
+}
+
+# Send a netconsole message to the netconsole target
+function test_send_netcons_msg_through_bond_iface() {
+ # Listen for netconsole port inside the namespace and
+ # destination interface
+ listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" &
+ # Wait for socat to start and listen to the port.
+ wait_for_port "${RXNS}" "${PORT}" "${IP_VERSION}"
+ # Send the message
+ echo "${MSG}: ${TARGET}" > /dev/kmsg
+ # Wait until socat saves the file to disk
+ busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
+ # Make sure the message was received in the dst part
+ # and exit
+ validate_result "${OUTPUT_FILE}" "${FORMAT}"
+ # kill socat in case it is still running
+ pkill_socat
+}
+
+# BOND_TX1_SLAVE_IF has netconsole enabled on it, bind it to BOND_TX_MAIN_IF.
+# Given BOND_TX_MAIN_IF was deleted, recreate it first
+function test_enslave_netcons_enabled_iface {
+ # netconsole got disabled while the interface was down
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]]
+ then
+ echo "test failed: netconsole expected to be enabled against BOND_TX1_SLAVE_IF" >&2
+ exit "${ksft_fail}"
+ fi
+
+ # recreate the bonding iface. it got deleted by previous
+ # test (test_delete_bond_and_reenable_target)
+ ip -n "${TXNS}" \
+ link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr
+
+ # sub-interface need to be down before attaching to bonding
+ # This will also disable netconsole.
+ ip -n "${TXNS}" \
+ link set "${BOND_TX1_SLAVE_IF}" down
+ ip -n "${TXNS}" \
+ link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}"
+ ip -n "${TXNS}" \
+ link set "${BOND_TX_MAIN_IF}" up
+
+ # netconsole got disabled while the interface was down
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]]
+ then
+ echo "test failed: Device is part of a bond iface, cannot have netcons enabled" >&2
+ exit "${ksft_fail}"
+ fi
+}
+
+# Get netconsole enabled on a bonding interface and attach a second
+# sub-interface.
+function test_enslave_iface_to_bond {
+ # BOND_TX_MAIN_IF has only BOND_TX1_SLAVE_IF right now
+ echo "${BOND_TX_MAIN_IF}" > "${NETCONS_PATH}"/dev_name
+ enable_netcons_ns
+
+ # netcons is attached to bond0 and BOND_TX1_SLAVE_IF is
+ # part of BOND_TX_MAIN_IF. Attach BOND_TX2_SLAVE_IF to BOND_TX_MAIN_IF.
+ ip -n "${TXNS}" \
+ link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}"
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]]
+ then
+ echo "test failed: Netconsole should be enabled on bonding interface. Failed" >&2
+ exit "${ksft_fail}"
+ fi
+}
+
+function test_enslave_iff_disabled_netpoll_iface {
+ local ret
+
+ # Create two interfaces. veth interfaces it known to have
+ # IFF_DISABLE_NETPOLL set
+ if ! ip link add "${VETH0}" type veth peer name "${VETH1}"
+ then
+ echo "Failed to create veth TX interface. Is CONFIG_VETH set?" >&2
+ exit "${ksft_skip}"
+ fi
+ set +e
+ # This will print RTNETLINK answers: Device or resource busy
+ ip link set "${VETH0}" master "${BOND_TX_MAIN_IF}" 2> /dev/null
+ ret=$?
+ set -e
+ if [[ $ret -eq 0 ]]
+ then
+ echo "test failed: veth interface could not be enslaved"
+ exit "${ksft_fail}"
+ fi
+}
+
+# Given that netconsole picks the current net namespace, we need to enable it
+# from inside the TXNS namespace
+function enable_netcons_ns() {
+ ip netns exec "${TXNS}" sh -c \
+ "mount -t configfs configfs /sys/kernel/config && echo 1 > $NETCONS_PATH/enabled"
+}
+
+####################
+# Tests start here #
+####################
+
+# Create regular interfaces using netdevsim and link them
+create_all_ifaces
+
+# Setup the bonding interfaces
+# BOND_RX_MAIN_IF has BOND_RX{1,2}_SLAVE_IF
+# BOND_TX_MAIN_IF has BOND_TX{1,2}_SLAVE_IF
+setup_bonding_ifaces
+
+# Configure the ips as BOND_RX1_SLAVE_IF and BOND_TX1_SLAVE_IF
+configure_ifaces_ips "${IP_VERSION}"
+
+_create_dynamic_target "${FORMAT}" "${NETCONS_PATH}"
+enable_netcons_ns
+set_user_data
+
+# Test #1 : Create an bonding interface and attach netpoll into
+# the bonding interface. Netconsole/netpoll should work on
+# the bonding interface.
+test_send_netcons_msg_through_bond_iface
+echo "test #1: netpoll on bonding interface worked. Test passed" >&2
+
+# Test #2: Attach netpoll to an enslaved interface
+# Try to attach netpoll to an enslaved sub-interface (while still being part of
+# a bonding interface), which shouldn't be allowed
+test_enable_netpoll_on_enslaved_iface
+echo "test #2: netpoll correctly rejected enslaved interface (expected behavior). Test passed." >&2
+
+# Test #3: Unplug the sub-interface from bond and enable netconsole
+# Detach the interface from a bonding interface and attach netpoll again
+test_delete_bond_and_reenable_target
+echo "test #3: Able to attach to an unbound interface. Test passed." >&2
+
+# Test #4: Enslave a sub-interface that had netconsole enabled
+# Try to enslave an interface that has netconsole/netpoll enabled.
+# Previous test has netconsole enabled in BOND_TX1_SLAVE_IF, try to enslave it
+test_enslave_netcons_enabled_iface
+echo "test #4: Enslaving an interface with netpoll attached. Test passed." >&2
+
+# Test #5: Enslave a sub-interface to a bonding interface
+# Enslave an interface to a bond interface that has netpoll attached
+# At this stage, BOND_TX_MAIN_IF is created and BOND_TX1_SLAVE_IF is part of
+# it. Netconsole is currently disabled
+test_enslave_iface_to_bond
+echo "test #5: Enslaving an interface to bond+netpoll. Test passed." >&2
+
+# Test #6: Enslave a IFF_DISABLE_NETPOLL sub-interface to a bonding interface
+# At this stage, BOND_TX_MAIN_IF has both sub interface and netconsole is
+# enabled. This test will try to enslave an a veth (IFF_DISABLE_NETPOLL) interface
+# and it should fail, with netpoll: veth0 doesn't support polling
+test_enslave_iff_disabled_netpoll_iface
+echo "test #6: Enslaving IFF_DISABLE_NETPOLL ifaces to bond iface is not supported. Test passed." >&2
+
+cleanup_bond
+trap - EXIT
+exit "${EXIT_STATUS}"
diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
index 8e1085e89647..87f89fd92f8c 100644
--- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
+++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
@@ -11,9 +11,11 @@ set -euo pipefail
LIBDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
SRCIF="" # to be populated later
+SRCIP="" # to be populated later
SRCIP4="192.0.2.1"
SRCIP6="fc00::1"
DSTIF="" # to be populated later
+DSTIP="" # to be populated later
DSTIP4="192.0.2.2"
DSTIP6="fc00::2"
@@ -28,17 +30,23 @@ NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}"
# NAMESPACE will be populated by setup_ns with a random value
NAMESPACE=""
-# IDs for netdevsim
+# IDs for netdevsim. We either use NSIM_DEV_{1,2}_ID for standard test
+# or NSIM_BOND_{T,R}X_{1,2} for the bonding tests. Not both at the
+# same time.
NSIM_DEV_1_ID=$((256 + RANDOM % 256))
NSIM_DEV_2_ID=$((512 + RANDOM % 256))
+NSIM_BOND_TX_1=$((768 + RANDOM % 256))
+NSIM_BOND_TX_2=$((1024 + RANDOM % 256))
+NSIM_BOND_RX_1=$((1280 + RANDOM % 256))
+NSIM_BOND_RX_2=$((1536 + RANDOM % 256))
NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device"
+NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device"
# Used to create and delete namespaces
source "${LIBDIR}"/../../../../net/lib.sh
# Create netdevsim interfaces
create_ifaces() {
-
echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW"
echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW"
udevadm settle 2> /dev/null || true
@@ -113,31 +121,38 @@ function set_network() {
configure_ip
}
-function create_dynamic_target() {
- local FORMAT=${1:-"extended"}
+function _create_dynamic_target() {
+ local FORMAT="${1:?FORMAT parameter required}"
+ local NCPATH="${2:?NCPATH parameter required}"
DSTMAC=$(ip netns exec "${NAMESPACE}" \
ip link show "${DSTIF}" | awk '/ether/ {print $2}')
# Create a dynamic target
- mkdir "${NETCONS_PATH}"
+ mkdir "${NCPATH}"
- echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip
- echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip
- echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac
- echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name
+ echo "${DSTIP}" > "${NCPATH}"/remote_ip
+ echo "${SRCIP}" > "${NCPATH}"/local_ip
+ echo "${DSTMAC}" > "${NCPATH}"/remote_mac
+ echo "${SRCIF}" > "${NCPATH}"/dev_name
if [ "${FORMAT}" == "basic" ]
then
# Basic target does not support release
- echo 0 > "${NETCONS_PATH}"/release
- echo 0 > "${NETCONS_PATH}"/extended
+ echo 0 > "${NCPATH}"/release
+ echo 0 > "${NCPATH}"/extended
elif [ "${FORMAT}" == "extended" ]
then
- echo 1 > "${NETCONS_PATH}"/extended
+ echo 1 > "${NCPATH}"/extended
fi
+}
- echo 1 > "${NETCONS_PATH}"/enabled
+function create_dynamic_target() {
+ local FORMAT=${1:-"extended"}
+ local NCPATH=${2:-"$NETCONS_PATH"}
+ _create_dynamic_target "${FORMAT}" "${NCPATH}"
+
+ echo 1 > "${NCPATH}"/enabled
# This will make sure that the kernel was able to
# load the netconsole driver configuration. The console message
@@ -185,14 +200,26 @@ function do_cleanup() {
echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk
}
-function cleanup() {
+function cleanup_netcons() {
# delete netconsole dynamic reconfiguration
- echo 0 > "${NETCONS_PATH}"/enabled
+ # do not fail if the target is already disabled
+ if [[ ! -d "${NETCONS_PATH}" ]]
+ then
+ # in some cases this is called before netcons path is created
+ return
+ fi
+ if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]]
+ then
+ echo 0 > "${NETCONS_PATH}"/enabled || true
+ fi
# Remove all the keys that got created during the selftest
find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete
# Remove the configfs entry
rmdir "${NETCONS_PATH}"
+}
+function cleanup() {
+ cleanup_netcons
do_cleanup
}
@@ -369,3 +396,24 @@ function wait_for_port() {
# more frequently on IPv6
sleep 1
}
+
+# Clean up netdevsim ifaces created for bonding test
+function cleanup_bond_nsim() {
+ ip -n "${TXNS}" \
+ link delete "${BOND_TX_MAIN_IF}" type bond || true
+ ip -n "${RXNS}" \
+ link delete "${BOND_RX_MAIN_IF}" type bond || true
+
+ cleanup_netdevsim "$NSIM_BOND_TX_1"
+ cleanup_netdevsim "$NSIM_BOND_TX_2"
+ cleanup_netdevsim "$NSIM_BOND_RX_1"
+ cleanup_netdevsim "$NSIM_BOND_RX_2"
+}
+
+# cleanup tests that use bonding interfaces
+function cleanup_bond() {
+ cleanup_netcons
+ cleanup_bond_nsim
+ cleanup_all_ns
+ ip link delete "${VETH0}" || true
+}
diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh
new file mode 100755
index 000000000000..2ce9ee3719d1
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netcons_torture.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Repeatedly send kernel messages, toggles netconsole targets on and off,
+# creates and deletes targets in parallel, and toggles the source interface to
+# simulate stress conditions.
+#
+# This test aims to verify the robustness of netconsole under dynamic
+# configurations and concurrent operations.
+#
+# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make
+# sure no issues is reported.
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
+
+# Number of times the main loop run
+ITERATIONS=${1:-150}
+
+# Only test extended format
+FORMAT="extended"
+# And ipv6 only
+IP_VERSION="ipv6"
+
+# Create, enable and delete some targets.
+create_and_delete_random_target() {
+ COUNT=2
+ RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_)
+
+ if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \
+ [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then
+ echo "Function didn't finish yet, skipping it." >&2
+ return
+ fi
+
+ # enable COUNT targets
+ for i in $(seq ${COUNT})
+ do
+ RND_TARGET="${RND_PREFIX}"${i}
+ RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}"
+
+ # Basic population so the target can come up
+ _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}"
+ done
+
+ echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg
+ # disable them all
+ for i in $(seq ${COUNT})
+ do
+ RND_TARGET="${RND_PREFIX}"${i}
+ RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}"
+ if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]]
+ then
+ echo 0 > "${RND_TARGET_PATH}"/enabled
+ fi
+ rmdir "${RND_TARGET_PATH}"
+ done
+}
+
+# Disable and enable the target mid-air, while messages
+# are being transmitted.
+toggle_netcons_target() {
+ for i in $(seq 2)
+ do
+ if [ ! -d "${NETCONS_PATH}" ]
+ then
+ break
+ fi
+ echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true
+ # Try to enable a bit harder, given it might fail to enable
+ # Write to `enabled` might fail depending on the lock, which is
+ # highly contentious here
+ for _ in $(seq 5)
+ do
+ echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true
+ done
+ done
+}
+
+toggle_iface(){
+ ip link set "${SRCIF}" down
+ ip link set "${SRCIF}" up
+}
+
+# Start here
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+echo "6 5" > /proc/sys/kernel/printk
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup EXIT
+# Create one namespace and two interfaces
+set_network "${IP_VERSION}"
+# Create a dynamic target for netconsole
+create_dynamic_target "${FORMAT}"
+
+for i in $(seq "$ITERATIONS")
+do
+ for _ in $(seq 10)
+ do
+ echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg
+ done
+ wait
+
+ if (( i % 30 == 0 )); then
+ toggle_netcons_target &
+ fi
+
+ if (( i % 50 == 0 )); then
+ # create some targets, enable them, send msg and disable
+ # all in a parallel thread
+ create_and_delete_random_target &
+ fi
+
+ if (( i % 70 == 0 )); then
+ toggle_iface &
+ fi
+done
+wait
+
+exit "${EXIT_STATUS}"
diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile
index daf51113c827..df10c7243511 100644
--- a/tools/testing/selftests/drivers/net/netdevsim/Makefile
+++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile
@@ -20,4 +20,8 @@ TEST_PROGS := \
udp_tunnel_nic.sh \
# end of TEST_PROGS
+TEST_FILES := \
+ ethtool-common.sh
+# end of TEST_FILES
+
include ../../../lib.mk
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
index c43a69dffd83..a0c64f415a7f 100644
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -487,7 +487,7 @@ int setup_userns(void)
uid_t uid = getuid();
gid_t gid = getgid();
- ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
+ ret = unshare(CLONE_NEWNS|CLONE_NEWUSER);
if (ret) {
ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
strerror(errno));
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
new file mode 100644
index 000000000000..7daf7292209e
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
@@ -0,0 +1,107 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Basic tests on writing to trace_marker_raw
+# requires: trace_marker_raw
+# flags: instance
+
+is_little_endian() {
+ if lscpu | grep -q 'Little Endian'; then
+ echo 1;
+ else
+ echo 0;
+ fi
+}
+
+little=`is_little_endian`
+
+make_str() {
+ id=$1
+ cnt=$2
+
+ if [ $little -eq 1 ]; then
+ val=`printf "\\%03o\\%03o\\%03o\\%03o" \
+ $(($id & 0xff)) \
+ $((($id >> 8) & 0xff)) \
+ $((($id >> 16) & 0xff)) \
+ $((($id >> 24) & 0xff))`
+ else
+ val=`printf "\\%03o\\%03o\\%03o\\%03o" \
+ $((($id >> 24) & 0xff)) \
+ $((($id >> 16) & 0xff)) \
+ $((($id >> 8) & 0xff)) \
+ $(($id & 0xff))`
+ fi
+
+ data=`printf -- 'X%.0s' $(seq $cnt)`
+
+ printf "${val}${data}"
+}
+
+write_buffer() {
+ id=$1
+ size=$2
+
+ # write the string into the raw marker
+ make_str $id $size > trace_marker_raw
+}
+
+
+test_multiple_writes() {
+
+ # Write a bunch of data where the id is the count of
+ # data to write
+ for i in `seq 1 10` `seq 101 110` `seq 1001 1010`; do
+ write_buffer $i $i
+ done
+
+ # add a little buffer
+ echo stop > trace_marker
+
+ # Check to make sure the number of entries is the id (rounded up by 4)
+ awk '/.*: # [0-9a-f]* / {
+ print;
+ cnt = -1;
+ for (i = 0; i < NF; i++) {
+ # The counter is after the "#" marker
+ if ( $i == "#" ) {
+ i++;
+ cnt = strtonum("0x" $i);
+ num = NF - (i + 1);
+ # The number of items is always rounded up by 4
+ cnt2 = int((cnt + 3) / 4) * 4;
+ if (cnt2 != num) {
+ exit 1;
+ }
+ break;
+ }
+ }
+ }
+ // { if (NR > 30) { exit 0; } } ' trace_pipe;
+}
+
+
+get_buffer_data_size() {
+ sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+test_buffer() {
+
+ # The id must be four bytes, test that 3 bytes fails a write
+ if echo -n abc > ./trace_marker_raw ; then
+ echo "Too small of write expected to fail but did not"
+ exit_fail
+ fi
+
+ size=`get_buffer_data_size`
+ echo size = $size
+
+ # Now add a little more than what it can handle
+
+ if write_buffer 0xdeadbeef $size ; then
+ echo "Too big of write expected to fail but did not"
+ exit_fail
+ fi
+}
+
+test_buffer
+test_multiple_writes
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc
index 2506f464811b..47067a5e3cb0 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc
@@ -28,25 +28,21 @@ test -d events/fprobes/myevent1
test -d events/fprobes/myevent2
echo 1 > events/fprobes/myevent1/enable
-# Make sure the event is attached and is the only one
+# Make sure the event is attached.
grep -q $PLACE enabled_functions
cnt=`cat enabled_functions | wc -l`
-if [ $cnt -ne $((ocnt + 1)) ]; then
+if [ $cnt -eq $ocnt ]; then
exit_fail
fi
echo 1 > events/fprobes/myevent2/enable
-# It should till be the only attached function
-cnt=`cat enabled_functions | wc -l`
-if [ $cnt -ne $((ocnt + 1)) ]; then
- exit_fail
-fi
+cnt2=`cat enabled_functions | wc -l`
echo 1 > events/fprobes/myevent3/enable
# If the function is different, the attached function should be increased
grep -q $PLACE2 enabled_functions
cnt=`cat enabled_functions | wc -l`
-if [ $cnt -ne $((ocnt + 2)) ]; then
+if [ $cnt -eq $cnt2 ]; then
exit_fail
fi
@@ -56,12 +52,6 @@ echo "-:myevent2" >> dynamic_events
grep -q myevent1 dynamic_events
! grep -q myevent2 dynamic_events
-# should still have 2 left
-cnt=`cat enabled_functions | wc -l`
-if [ $cnt -ne $((ocnt + 2)) ]; then
- exit_fail
-fi
-
echo 0 > events/fprobes/enable
echo > dynamic_events
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc
new file mode 100644
index 000000000000..c1f1cafa30f3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc
@@ -0,0 +1,40 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Generic dynamic event - enable/disable tracepoint probe events
+# requires: dynamic_events "t[:[<group>/][<event>]] <tracepoint> [<args>]":README
+
+echo 0 > events/enable
+echo > dynamic_events
+
+TRACEPOINT=sched_switch
+ENABLEFILE=events/tracepoints/myprobe/enable
+
+:;: "Add tracepoint event on $TRACEPOINT" ;:
+
+echo "t:myprobe ${TRACEPOINT}" >> dynamic_events
+
+:;: "Check enable/disable to ensure it works" ;:
+
+echo 1 > $ENABLEFILE
+
+grep -q $TRACEPOINT trace
+
+echo 0 > $ENABLEFILE
+
+echo > trace
+
+! grep -q $TRACEPOINT trace
+
+:;: "Repeat enable/disable to ensure it works" ;:
+
+echo 1 > $ENABLEFILE
+
+grep -q $TRACEPOINT trace
+
+echo 0 > $ENABLEFILE
+
+echo > trace
+
+! grep -q $TRACEPOINT trace
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
index c62165fabd0c..cfa16aa1f39a 100644
--- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
+++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
@@ -20,6 +20,10 @@ sample_events() {
echo 0 > tracing_on
echo 0 > events/enable
+# Clear functions caused by page cache; run sample_events twice
+sample_events
+sample_events
+
echo "Get the most frequently calling function"
echo > trace
sample_events
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 3eebf5e3b974..bb4d33dde3c8 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -2638,6 +2638,8 @@ TEST_F(vfio_compat_mock_domain, map)
ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size);
+ /* Unmap of empty is success */
+ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
/* UNMAP_FLAG_ALL requires 0 iova/size */
ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 772ca1db6e59..9f472c20c190 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -1044,8 +1044,8 @@ static int _test_cmd_trigger_vevents(int fd, __u32 dev_id, __u32 nvevents)
};
while (nvevents--) {
- if (!ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT),
- &trigger_vevent_cmd))
+ if (ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT),
+ &trigger_vevent_cmd))
return -1;
}
return 0;
diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh
index 2c3c58e65a41..3a62039fa621 100644
--- a/tools/testing/selftests/kselftest/runner.sh
+++ b/tools/testing/selftests/kselftest/runner.sh
@@ -44,6 +44,12 @@ tap_timeout()
fi
}
+report_failure()
+{
+ echo "not ok $*"
+ echo "$*" >> "$kselftest_failures_file"
+}
+
run_one()
{
DIR="$1"
@@ -105,7 +111,7 @@ run_one()
echo "# $TEST_HDR_MSG"
if [ ! -e "$TEST" ]; then
echo "# Warning: file $TEST is missing!"
- echo "not ok $test_num $TEST_HDR_MSG"
+ report_failure "$test_num $TEST_HDR_MSG"
else
if [ -x /usr/bin/stdbuf ]; then
stdbuf="/usr/bin/stdbuf --output=L "
@@ -123,7 +129,7 @@ run_one()
interpreter=$(head -n 1 "$TEST" | cut -c 3-)
cmd="$stdbuf $interpreter ./$BASENAME_TEST"
else
- echo "not ok $test_num $TEST_HDR_MSG"
+ report_failure "$test_num $TEST_HDR_MSG"
return
fi
fi
@@ -137,9 +143,9 @@ run_one()
echo "ok $test_num $TEST_HDR_MSG # SKIP"
elif [ $rc -eq $timeout_rc ]; then \
echo "#"
- echo "not ok $test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds"
+ report_failure "$test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds"
else
- echo "not ok $test_num $TEST_HDR_MSG # exit=$rc"
+ report_failure "$test_num $TEST_HDR_MSG # exit=$rc"
fi)
cd - >/dev/null
fi
diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c
index c9b84eeaab6b..0a3a94c4cca1 100644
--- a/tools/testing/selftests/kvm/arm64/get-reg-list.c
+++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c
@@ -63,11 +63,13 @@ static struct feature_id_reg feat_id_regs[] = {
REG_FEAT(HDFGWTR2_EL2, ID_AA64MMFR0_EL1, FGT, FGT2),
REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP),
REG_FEAT(SCTLR2_EL1, ID_AA64MMFR3_EL1, SCTLRX, IMP),
+ REG_FEAT(SCTLR2_EL2, ID_AA64MMFR3_EL1, SCTLRX, IMP),
REG_FEAT(VDISR_EL2, ID_AA64PFR0_EL1, RAS, IMP),
REG_FEAT(VSESR_EL2, ID_AA64PFR0_EL1, RAS, IMP),
REG_FEAT(VNCR_EL2, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY),
REG_FEAT(CNTHV_CTL_EL2, ID_AA64MMFR1_EL1, VH, IMP),
REG_FEAT(CNTHV_CVAL_EL2,ID_AA64MMFR1_EL1, VH, IMP),
+ REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP),
};
bool filter_reg(__u64 reg)
@@ -718,6 +720,7 @@ static __u64 el2_regs[] = {
SYS_REG(VMPIDR_EL2),
SYS_REG(SCTLR_EL2),
SYS_REG(ACTLR_EL2),
+ SYS_REG(SCTLR2_EL2),
SYS_REG(HCR_EL2),
SYS_REG(MDCR_EL2),
SYS_REG(CPTR_EL2),
diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c
index 5e24f77868b5..c4815d365816 100644
--- a/tools/testing/selftests/kvm/arm64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c
@@ -268,7 +268,9 @@ static void guest_code(void)
/* Return a safe value to a given ftr_bits an ftr value */
uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr)
{
- uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0);
+ uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift;
+
+ TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features");
if (ftr_bits->sign == FTR_UNSIGNED) {
switch (ftr_bits->type) {
@@ -320,7 +322,9 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr)
/* Return an invalid value to a given ftr_bits an ftr value */
uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr)
{
- uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0);
+ uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift;
+
+ TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features");
if (ftr_bits->sign == FTR_UNSIGNED) {
switch (ftr_bits->type) {
@@ -672,7 +676,7 @@ static void test_clidr(struct kvm_vcpu *vcpu)
clidr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1));
/* find the first empty level in the cache hierarchy */
- for (level = 1; level < 7; level++) {
+ for (level = 1; level <= 7; level++) {
if (!CLIDR_CTYPE(clidr, level))
break;
}
diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
index 09f270545646..0e2f8ed90f30 100644
--- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
+++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
@@ -15,6 +15,8 @@
#include "gic_v3.h"
#include "processor.h"
+#define GITS_COLLECTION_TARGET_SHIFT 16
+
static u64 its_read_u64(unsigned long offset)
{
return readq_relaxed(GITS_BASE_GVA + offset);
@@ -163,6 +165,11 @@ static void its_encode_collection(struct its_cmd_block *cmd, u16 col)
its_mask_encode(&cmd->raw_cmd[2], col, 15, 0);
}
+static u64 procnum_to_rdbase(u32 vcpu_id)
+{
+ return vcpu_id << GITS_COLLECTION_TARGET_SHIFT;
+}
+
#define GITS_CMDQ_POLL_ITERATIONS 0
static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd)
@@ -217,7 +224,7 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val
its_encode_cmd(&cmd, GITS_CMD_MAPC);
its_encode_collection(&cmd, collection_id);
- its_encode_target(&cmd, vcpu_id);
+ its_encode_target(&cmd, procnum_to_rdbase(vcpu_id));
its_encode_valid(&cmd, valid);
its_send_cmd(cmdq_base, &cmd);
diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh
index 46991a029f7c..8ec0cb64ad94 100644
--- a/tools/testing/selftests/livepatch/functions.sh
+++ b/tools/testing/selftests/livepatch/functions.sh
@@ -10,7 +10,11 @@ SYSFS_KERNEL_DIR="/sys/kernel"
SYSFS_KLP_DIR="$SYSFS_KERNEL_DIR/livepatch"
SYSFS_DEBUG_DIR="$SYSFS_KERNEL_DIR/debug"
SYSFS_KPROBES_DIR="$SYSFS_DEBUG_DIR/kprobes"
-SYSFS_TRACING_DIR="$SYSFS_DEBUG_DIR/tracing"
+if [[ -e /sys/kernel/tracing/trace ]]; then
+ SYSFS_TRACING_DIR="$SYSFS_KERNEL_DIR/tracing"
+else
+ SYSFS_TRACING_DIR="$SYSFS_DEBUG_DIR/tracing"
+fi
# Kselftest framework requirement - SKIP code is 4
ksft_skip=4
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index 9e3be2ee7f1b..f917b4c4c943 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -1758,10 +1758,15 @@ int main(int argc, char *argv[])
uffd_test_ops = mem_type->mem_ops;
uffd_test_case_ops = test->test_case_ops;
- if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB))
+ if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) {
gopts.page_size = default_huge_page_size();
- else
+ if (gopts.page_size == 0) {
+ uffd_test_skip("huge page size is 0, feature missing?");
+ continue;
+ }
+ } else {
gopts.page_size = psize();
+ }
/* Ensure we have at least 2 pages */
gopts.nr_pages = MAX(UFFD_TEST_MEM_SIZE, gopts.page_size * 2)
@@ -1776,12 +1781,6 @@ int main(int argc, char *argv[])
continue;
uffd_test_start("%s on %s", test->name, mem_type->name);
- if ((mem_type->mem_flag == MEM_HUGETLB ||
- mem_type->mem_flag == MEM_HUGETLB_PRIVATE) &&
- (default_huge_page_size() == 0)) {
- uffd_test_skip("huge page size is 0, feature missing?");
- continue;
- }
if (!uffd_feature_supported(test)) {
uffd_test_skip("feature missing");
continue;
diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore
index ccfb40837a73..0989e80da457 100644
--- a/tools/testing/selftests/namespaces/.gitignore
+++ b/tools/testing/selftests/namespaces/.gitignore
@@ -1,3 +1,12 @@
nsid_test
file_handle_test
init_ino_test
+ns_active_ref_test
+listns_test
+listns_permissions_test
+listns_efault_test
+siocgskns_test
+cred_change_test
+stress_test
+listns_pagination_bug
+regression_pidfd_setns_test
diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile
index 5fe4b3dc07d3..fbb821652c17 100644
--- a/tools/testing/selftests/namespaces/Makefile
+++ b/tools/testing/selftests/namespaces/Makefile
@@ -1,7 +1,29 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
-TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test
+TEST_GEN_PROGS := nsid_test \
+ file_handle_test \
+ init_ino_test \
+ ns_active_ref_test \
+ listns_test \
+ listns_permissions_test \
+ listns_efault_test \
+ siocgskns_test \
+ cred_change_test \
+ stress_test \
+ listns_pagination_bug \
+ regression_pidfd_setns_test
include ../lib.mk
+$(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c
+$(OUTPUT)/listns_test: ../filesystems/utils.c
+$(OUTPUT)/listns_permissions_test: ../filesystems/utils.c
+$(OUTPUT)/listns_efault_test: ../filesystems/utils.c
+$(OUTPUT)/siocgskns_test: ../filesystems/utils.c
+$(OUTPUT)/cred_change_test: ../filesystems/utils.c
+$(OUTPUT)/stress_test: ../filesystems/utils.c
+$(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c
+$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c
+
diff --git a/tools/testing/selftests/namespaces/cred_change_test.c b/tools/testing/selftests/namespaces/cred_change_test.c
new file mode 100644
index 000000000000..7b4f5ad3f725
--- /dev/null
+++ b/tools/testing/selftests/namespaces/cred_change_test.c
@@ -0,0 +1,814 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test credential changes and their impact on namespace active references.
+ */
+
+/*
+ * Test setuid() in a user namespace properly swaps active references.
+ * Create a user namespace with multiple UIDs mapped, then setuid() between them.
+ * Verify that the user namespace remains active throughout.
+ */
+TEST(setuid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setuid_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace with multiple UIDs mapped (0-9) */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send namespace ID to parent */
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /*
+ * Perform multiple setuid() calls.
+ * Each setuid() triggers commit_creds() which should properly
+ * swap active references via switch_cred_namespaces().
+ */
+ for (setuid_count = 0; setuid_count < 50; setuid_count++) {
+ uid_t target_uid = (setuid_count % 10);
+ if (setuid(target_uid) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id);
+
+ /* Verify namespace is active while child is running */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(found);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive after child exits */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setuid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test setgid() in a user namespace properly handles active references.
+ */
+TEST(setgid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setgid_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace with multiple GIDs mapped */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /* Perform multiple setgid() calls */
+ for (setgid_count = 0; setgid_count < 50; setgid_count++) {
+ gid_t target_gid = (setgid_count % 10);
+ if (setgid(target_gid) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setgid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test setresuid() which changes real, effective, and saved UIDs.
+ * This should properly swap active references via commit_creds().
+ */
+TEST(setresuid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setres_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /* Perform multiple setresuid() calls */
+ for (setres_count = 0; setres_count < 30; setres_count++) {
+ uid_t uid1 = (setres_count % 5);
+ uid_t uid2 = ((setres_count + 1) % 5);
+ uid_t uid3 = ((setres_count + 2) % 5);
+
+ if (setresuid(uid1, uid2, uid3) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setresuid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test credential changes across multiple user namespaces.
+ * Create nested user namespaces and verify active reference tracking.
+ */
+TEST(cred_change_nested_userns)
+{
+ pid_t pid;
+ int status;
+ __u64 parent_userns_id, child_userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found_parent = false, found_child = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 parent_id, child_id;
+ uid_t orig_uid = getuid();
+
+ close(pipefd[0]);
+
+ /* Create first user namespace */
+ userns_fd = get_userns_fd(0, orig_uid, 1);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get first namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &parent_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create nested user namespace */
+ userns_fd = get_userns_fd(0, 0, 1);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get nested namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send both IDs to parent */
+ write(pipefd[1], &parent_id, sizeof(parent_id));
+ write(pipefd[1], &child_id, sizeof(child_id));
+
+ /* Perform some credential changes in nested namespace */
+ setuid(0);
+ setgid(0);
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ /* Read both namespace IDs */
+ if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get parent namespace ID");
+ }
+
+ if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get child namespace ID");
+ }
+ close(pipefd[0]);
+
+ TH_LOG("Parent userns: %llu, Child userns: %llu",
+ (unsigned long long)parent_userns_id,
+ (unsigned long long)child_userns_id);
+
+ /* Verify both namespaces are active */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_userns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_userns_id)
+ found_child = true;
+ }
+
+ ASSERT_TRUE(found_parent);
+ ASSERT_TRUE(found_child);
+
+ /* Wait for child */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify both namespaces become inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found_parent = false;
+ found_child = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_userns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_userns_id)
+ found_child = true;
+ }
+
+ ASSERT_FALSE(found_parent);
+ ASSERT_FALSE(found_child);
+ TH_LOG("Nested user namespace credential changes preserved active refs (no leak)");
+}
+
+/*
+ * Test rapid credential changes don't cause refcount imbalances.
+ * This stress-tests the switch_cred_namespaces() logic.
+ */
+TEST(rapid_cred_changes_no_leak)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int change_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace with wider range of UIDs/GIDs */
+ userns_fd = get_userns_fd(0, orig_uid, 100);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /*
+ * Perform many rapid credential changes.
+ * Mix setuid, setgid, setreuid, setregid, setresuid, setresgid.
+ */
+ for (change_count = 0; change_count < 200; change_count++) {
+ switch (change_count % 6) {
+ case 0:
+ setuid(change_count % 50);
+ break;
+ case 1:
+ setgid(change_count % 50);
+ break;
+ case 2:
+ setreuid(change_count % 50, (change_count + 1) % 50);
+ break;
+ case 3:
+ setregid(change_count % 50, (change_count + 1) % 50);
+ break;
+ case 4:
+ setresuid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50);
+ break;
+ case 5:
+ setresgid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50);
+ break;
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive (no leaked active refs) */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("200 rapid credential changes completed with no active ref leak");
+}
+
+/*
+ * Test setfsuid/setfsgid which change filesystem UID/GID.
+ * These also trigger credential changes but may have different code paths.
+ */
+TEST(setfsuid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int change_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /* Perform multiple setfsuid/setfsgid calls */
+ for (change_count = 0; change_count < 50; change_count++) {
+ setfsuid(change_count % 10);
+ setfsgid(change_count % 10);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)");
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
new file mode 100644
index 000000000000..c7ed4023d7a8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_efault_test.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "../pidfd/pidfd.h"
+#include "wrappers.h"
+
+/*
+ * Test listns() error handling with invalid buffer addresses.
+ *
+ * When the buffer pointer is invalid (e.g., crossing page boundaries
+ * into unmapped memory), listns() returns EINVAL.
+ *
+ * This test also creates mount namespaces that get destroyed during
+ * iteration, testing that namespace cleanup happens outside the RCU
+ * read lock.
+ */
+TEST(listns_partial_fault_with_ns_cleanup)
+{
+ void *map;
+ __u64 *ns_ids;
+ ssize_t ret;
+ long page_size;
+ pid_t pid, iter_pid;
+ int pidfds[5];
+ int sv[5][2];
+ int iter_pidfd;
+ int i, status;
+ char c;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ ASSERT_GT(page_size, 0);
+
+ /*
+ * Map two pages:
+ * - First page: readable and writable
+ * - Second page: will be unmapped to trigger EFAULT
+ */
+ map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(map, MAP_FAILED);
+
+ /* Unmap the second page */
+ ret = munmap((char *)map + page_size, page_size);
+ ASSERT_EQ(ret, 0);
+
+ /*
+ * Position the buffer pointer so there's room for exactly one u64
+ * before the page boundary. The second u64 would fall into the
+ * unmapped page.
+ */
+ ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
+
+ /*
+ * Create a separate process to run listns() in a loop concurrently
+ * with namespace creation and destruction.
+ */
+ iter_pid = create_child(&iter_pidfd, 0);
+ ASSERT_NE(iter_pid, -1);
+
+ if (iter_pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0, /* All types */
+ .spare2 = 0,
+ .user_ns_id = 0, /* Global listing */
+ };
+ int iter_ret;
+
+ /*
+ * Loop calling listns() until killed.
+ * The kernel should:
+ * 1. Successfully write the first namespace ID (within valid page)
+ * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
+ * 3. Handle concurrent namespace destruction without deadlock
+ */
+ while (1) {
+ iter_ret = sys_listns(&req, ns_ids, 2, 0);
+
+ if (iter_ret == -1 && errno == ENOSYS)
+ _exit(PIDFD_SKIP);
+ }
+ }
+
+ /* Small delay to let iterator start looping */
+ usleep(50000);
+
+ /*
+ * Create several child processes, each in its own mount namespace.
+ * These will be destroyed while the iterator is running listns().
+ */
+ for (i = 0; i < 5; i++) {
+ /* Create socketpair for synchronization */
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+ pid = create_child(&pidfds[i], CLONE_NEWNS);
+ ASSERT_NE(pid, -1);
+
+ if (pid == 0) {
+ close(sv[i][0]); /* Close parent end */
+
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+ _exit(1);
+
+ /* Child: create a couple of tmpfs mounts */
+ if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+ if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+
+ if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+ if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+
+ /* Signal parent that setup is complete */
+ if (write_nointr(sv[i][1], "R", 1) != 1)
+ _exit(1);
+
+ /* Wait for parent to signal us to exit */
+ if (read_nointr(sv[i][1], &c, 1) != 1)
+ _exit(1);
+
+ close(sv[i][1]);
+ _exit(0);
+ }
+
+ close(sv[i][1]); /* Close child end */
+ }
+
+ /* Wait for all children to finish setup */
+ for (i = 0; i < 5; i++) {
+ ret = read_nointr(sv[i][0], &c, 1);
+ ASSERT_EQ(ret, 1);
+ ASSERT_EQ(c, 'R');
+ }
+
+ /*
+ * Signal children to exit. This will destroy their mount namespaces
+ * while listns() is iterating the namespace tree.
+ * This tests that cleanup happens outside the RCU read lock.
+ */
+ for (i = 0; i < 5; i++)
+ write_nointr(sv[i][0], "X", 1);
+
+ /* Wait for all mount namespace children to exit and cleanup */
+ for (i = 0; i < 5; i++) {
+ waitpid(-1, NULL, 0);
+ close(sv[i][0]);
+ close(pidfds[i]);
+ }
+
+ /* Kill iterator and wait for it */
+ sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+ ret = waitpid(iter_pid, &status, 0);
+ ASSERT_EQ(ret, iter_pid);
+ close(iter_pidfd);
+
+ /* Should have been killed */
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+ /* Clean up */
+ munmap(map, page_size);
+}
+
+/*
+ * Test listns() error handling when the entire buffer is invalid.
+ * This is a sanity check that basic invalid pointer detection works.
+ */
+TEST(listns_complete_fault)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 *ns_ids;
+ ssize_t ret;
+
+ /* Use a clearly invalid pointer */
+ ns_ids = (__u64 *)0xdeadbeef;
+
+ ret = sys_listns(&req, ns_ids, 10, 0);
+
+ if (ret == -1 && errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+
+ /* Should fail with EFAULT */
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() error handling when the buffer is NULL.
+ */
+TEST(listns_null_buffer)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ ssize_t ret;
+
+ /* NULL buffer with non-zero count should fail */
+ ret = sys_listns(&req, NULL, 10, 0);
+
+ if (ret == -1 && errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+
+ /* Should fail with EFAULT */
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() with a buffer that becomes invalid mid-iteration
+ * (after several successful writes), combined with mount namespace
+ * destruction to test RCU cleanup logic.
+ */
+TEST(listns_late_fault_with_ns_cleanup)
+{
+ void *map;
+ __u64 *ns_ids;
+ ssize_t ret;
+ long page_size;
+ pid_t pid, iter_pid;
+ int pidfds[10];
+ int sv[10][2];
+ int iter_pidfd;
+ int i, status;
+ char c;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ ASSERT_GT(page_size, 0);
+
+ /* Map two pages */
+ map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(map, MAP_FAILED);
+
+ /* Unmap the second page */
+ ret = munmap((char *)map + page_size, page_size);
+ ASSERT_EQ(ret, 0);
+
+ /*
+ * Position buffer so we can write several u64s successfully
+ * before hitting the page boundary.
+ */
+ ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
+
+ /*
+ * Create a separate process to run listns() concurrently.
+ */
+ iter_pid = create_child(&iter_pidfd, 0);
+ ASSERT_NE(iter_pid, -1);
+
+ if (iter_pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ int iter_ret;
+
+ /*
+ * Loop calling listns() until killed.
+ * Request 10 namespace IDs while namespaces are being destroyed.
+ * This tests:
+ * 1. EFAULT handling when buffer becomes invalid
+ * 2. Namespace cleanup outside RCU read lock during iteration
+ */
+ while (1) {
+ iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+ if (iter_ret == -1 && errno == ENOSYS)
+ _exit(PIDFD_SKIP);
+ }
+ }
+
+ /* Small delay to let iterator start looping */
+ usleep(50000);
+
+ /*
+ * Create more children with mount namespaces to increase the
+ * likelihood that namespace cleanup happens during iteration.
+ */
+ for (i = 0; i < 10; i++) {
+ /* Create socketpair for synchronization */
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+ pid = create_child(&pidfds[i], CLONE_NEWNS);
+ ASSERT_NE(pid, -1);
+
+ if (pid == 0) {
+ close(sv[i][0]); /* Close parent end */
+
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+ _exit(1);
+
+ /* Child: create tmpfs mounts */
+ if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+ if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+
+ if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+ if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+
+ /* Signal parent that setup is complete */
+ if (write_nointr(sv[i][1], "R", 1) != 1)
+ _exit(1);
+
+ /* Wait for parent to signal us to exit */
+ if (read_nointr(sv[i][1], &c, 1) != 1)
+ _exit(1);
+
+ close(sv[i][1]);
+ _exit(0);
+ }
+
+ close(sv[i][1]); /* Close child end */
+ }
+
+ /* Wait for all children to finish setup */
+ for (i = 0; i < 10; i++) {
+ ret = read_nointr(sv[i][0], &c, 1);
+ ASSERT_EQ(ret, 1);
+ ASSERT_EQ(c, 'R');
+ }
+
+ /* Kill half the children */
+ for (i = 0; i < 5; i++)
+ write_nointr(sv[i][0], "X", 1);
+
+ /* Small delay to let some exit */
+ usleep(10000);
+
+ /* Kill remaining children */
+ for (i = 5; i < 10; i++)
+ write_nointr(sv[i][0], "X", 1);
+
+ /* Wait for all children and cleanup */
+ for (i = 0; i < 10; i++) {
+ waitpid(-1, NULL, 0);
+ close(sv[i][0]);
+ close(pidfds[i]);
+ }
+
+ /* Kill iterator and wait for it */
+ sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+ ret = waitpid(iter_pid, &status, 0);
+ ASSERT_EQ(ret, iter_pid);
+ close(iter_pidfd);
+
+ /* Should have been killed */
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+ /* Clean up */
+ munmap(map, page_size);
+}
+
+/*
+ * Test specifically focused on mount namespace cleanup during EFAULT.
+ * Filter for mount namespaces only.
+ */
+TEST(listns_mnt_ns_cleanup_on_fault)
+{
+ void *map;
+ __u64 *ns_ids;
+ ssize_t ret;
+ long page_size;
+ pid_t pid, iter_pid;
+ int pidfds[8];
+ int sv[8][2];
+ int iter_pidfd;
+ int i, status;
+ char c;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ ASSERT_GT(page_size, 0);
+
+ /* Set up partial fault buffer */
+ map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(map, MAP_FAILED);
+
+ ret = munmap((char *)map + page_size, page_size);
+ ASSERT_EQ(ret, 0);
+
+ /* Position for 3 successful writes, then fault */
+ ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
+
+ /*
+ * Create a separate process to run listns() concurrently.
+ */
+ iter_pid = create_child(&iter_pidfd, 0);
+ ASSERT_NE(iter_pid, -1);
+
+ if (iter_pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNS, /* Only mount namespaces */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ int iter_ret;
+
+ /*
+ * Loop calling listns() until killed.
+ * Call listns() to race with namespace destruction.
+ */
+ while (1) {
+ iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+ if (iter_ret == -1 && errno == ENOSYS)
+ _exit(PIDFD_SKIP);
+ }
+ }
+
+ /* Small delay to let iterator start looping */
+ usleep(50000);
+
+ /* Create children with mount namespaces */
+ for (i = 0; i < 8; i++) {
+ /* Create socketpair for synchronization */
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+ pid = create_child(&pidfds[i], CLONE_NEWNS);
+ ASSERT_NE(pid, -1);
+
+ if (pid == 0) {
+ close(sv[i][0]); /* Close parent end */
+
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+ _exit(1);
+
+ /* Do some mount operations to make cleanup more interesting */
+ if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+ if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+ _exit(1);
+
+ if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+ if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+ _exit(1);
+
+ /* Signal parent that setup is complete */
+ if (write_nointr(sv[i][1], "R", 1) != 1)
+ _exit(1);
+
+ /* Wait for parent to signal us to exit */
+ if (read_nointr(sv[i][1], &c, 1) != 1)
+ _exit(1);
+
+ close(sv[i][1]);
+ _exit(0);
+ }
+
+ close(sv[i][1]); /* Close child end */
+ }
+
+ /* Wait for all children to finish setup */
+ for (i = 0; i < 8; i++) {
+ ret = read_nointr(sv[i][0], &c, 1);
+ ASSERT_EQ(ret, 1);
+ ASSERT_EQ(c, 'R');
+ }
+
+ /* Kill children to trigger namespace destruction during iteration */
+ for (i = 0; i < 8; i++)
+ write_nointr(sv[i][0], "X", 1);
+
+ /* Wait for children and cleanup */
+ for (i = 0; i < 8; i++) {
+ waitpid(-1, NULL, 0);
+ close(sv[i][0]);
+ close(pidfds[i]);
+ }
+
+ /* Kill iterator and wait for it */
+ sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+ ret = waitpid(iter_pid, &status, 0);
+ ASSERT_EQ(ret, iter_pid);
+ close(iter_pidfd);
+
+ /* Should have been killed */
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+ munmap(map, page_size);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_pagination_bug.c b/tools/testing/selftests/namespaces/listns_pagination_bug.c
new file mode 100644
index 000000000000..da7d33f96397
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_pagination_bug.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Minimal test case to reproduce KASAN out-of-bounds in listns pagination.
+ *
+ * The bug occurs when:
+ * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER)
+ * 2. Using pagination (req.ns_id != 0)
+ * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of
+ * the filtered type, causing it to search the unified tree and potentially
+ * return a namespace of the wrong type.
+ */
+TEST(pagination_with_type_filter)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER, /* Filter by user namespace */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ pid_t pids[10];
+ int num_children = 10;
+ int i;
+ int sv[2];
+ __u64 first_batch[3];
+ ssize_t ret;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create children with user namespaces */
+ for (i = 0; i < num_children; i++) {
+ pids[i] = fork();
+ ASSERT_GE(pids[i], 0);
+
+ if (pids[i] == 0) {
+ char c;
+ close(sv[0]);
+
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Signal parent we're ready */
+ if (write(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal to exit */
+ if (read(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ close(sv[1]);
+ exit(0);
+ }
+ }
+
+ close(sv[1]);
+
+ /* Wait for all children to signal ready */
+ for (i = 0; i < num_children; i++) {
+ char c;
+ if (read(sv[0], &c, 1) != 1) {
+ close(sv[0]);
+ for (int j = 0; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ /* First batch - this should work */
+ ret = sys_listns(&req, first_batch, 3, 0);
+ if (ret < 0) {
+ if (errno == ENOSYS) {
+ close(sv[0]);
+ for (i = 0; i < num_children; i++)
+ kill(pids[i], SIGKILL);
+ for (i = 0; i < num_children; i++)
+ waitpid(pids[i], NULL, 0);
+ SKIP(return, "listns() not supported");
+ }
+ ASSERT_GE(ret, 0);
+ }
+
+ TH_LOG("First batch returned %zd entries", ret);
+
+ if (ret == 3) {
+ __u64 second_batch[3];
+
+ /* Second batch - pagination triggers the bug */
+ req.ns_id = first_batch[2]; /* Continue from last ID */
+ ret = sys_listns(&req, second_batch, 3, 0);
+
+ TH_LOG("Second batch returned %zd entries", ret);
+ ASSERT_GE(ret, 0);
+ }
+
+ /* Signal all children to exit */
+ for (i = 0; i < num_children; i++) {
+ char c = 'X';
+ if (write(sv[0], &c, 1) != 1) {
+ close(sv[0]);
+ for (int j = i; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ close(sv[0]);
+
+ /* Cleanup */
+ for (i = 0; i < num_children; i++) {
+ int status;
+ waitpid(pids[i], &status, 0);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_permissions_test.c b/tools/testing/selftests/namespaces/listns_permissions_test.c
new file mode 100644
index 000000000000..82d818751a5f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_permissions_test.c
@@ -0,0 +1,759 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test that unprivileged users can only see namespaces they're currently in.
+ * Create a namespace, drop privileges, verify we can only see our own namespaces.
+ */
+TEST(listns_unprivileged_current_only)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool found_ours;
+ int unexpected_count;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 our_netns_id;
+ bool found_ours;
+ int unexpected_count;
+
+ close(pipefd[0]);
+
+ /* Create user namespace to be unprivileged */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Create a network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get our network namespace ID */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &our_netns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Now we're unprivileged - list all network namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* We should only see our own network namespace */
+ found_ours = false;
+ unexpected_count = 0;
+
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == our_netns_id) {
+ found_ours = true;
+ } else {
+ /* This is either init_net (which we can see) or unexpected */
+ unexpected_count++;
+ }
+ }
+
+ /* Send results to parent */
+ write(pipefd[1], &found_ours, sizeof(found_ours));
+ write(pipefd[1], &unexpected_count, sizeof(unexpected_count));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ found_ours = false;
+ unexpected_count = 0;
+ read(pipefd[0], &found_ours, sizeof(found_ours));
+ read(pipefd[0], &unexpected_count, sizeof(unexpected_count));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Child should have seen its own namespace */
+ ASSERT_TRUE(found_ours);
+
+ TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)",
+ unexpected_count);
+}
+
+/*
+ * Test that users with CAP_SYS_ADMIN in a user namespace can see
+ * all namespaces owned by that user namespace.
+ */
+TEST(listns_cap_sys_admin_in_userns)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0, /* All types */
+ .spare2 = 0,
+ .user_ns_id = 0, /* Will be set to our created user namespace */
+ };
+ __u64 ns_ids[100];
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool success;
+ ssize_t count;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 userns_id;
+ ssize_t ret;
+ int min_expected;
+ bool success;
+
+ close(pipefd[0]);
+
+ /* Create user namespace - we'll have CAP_SYS_ADMIN in it */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get the user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create several namespaces owned by this user namespace */
+ unshare(CLONE_NEWNET);
+ unshare(CLONE_NEWUTS);
+ unshare(CLONE_NEWIPC);
+
+ /* List namespaces owned by our user namespace */
+ req.user_ns_id = userns_id;
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /*
+ * We have CAP_SYS_ADMIN in this user namespace,
+ * so we should see all namespaces owned by it.
+ * That includes: net, uts, ipc, and the user namespace itself.
+ */
+ min_expected = 4;
+ success = (ret >= min_expected);
+
+ write(pipefd[1], &success, sizeof(success));
+ write(pipefd[1], &ret, sizeof(ret));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ success = false;
+ count = 0;
+ read(pipefd[0], &success, sizeof(success));
+ read(pipefd[0], &count, sizeof(count));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(success);
+ TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace",
+ count);
+}
+
+/*
+ * Test that users cannot see namespaces from unrelated user namespaces.
+ * Create two sibling user namespaces, verify they can't see each other's
+ * owned namespaces.
+ */
+TEST(listns_cannot_see_sibling_userns_namespaces)
+{
+ int pipefd[2];
+ pid_t pid1, pid2;
+ int status;
+ __u64 netns_a_id;
+ int pipefd2[2];
+ bool found_sibling_netns;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Fork first child - creates user namespace A */
+ pid1 = fork();
+ ASSERT_GE(pid1, 0);
+
+ if (pid1 == 0) {
+ int fd;
+ __u64 netns_a_id;
+ char buf;
+
+ close(pipefd[0]);
+
+ /* Create user namespace A */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Create network namespace owned by user namespace A */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get network namespace ID */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &netns_a_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send namespace ID to parent */
+ write(pipefd[1], &netns_a_id, sizeof(netns_a_id));
+
+ /* Keep alive for sibling to check */
+ read(pipefd[1], &buf, 1);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent reads namespace A ID */
+ close(pipefd[1]);
+ netns_a_id = 0;
+ read(pipefd[0], &netns_a_id, sizeof(netns_a_id));
+
+ TH_LOG("User namespace A created network namespace with ID %llu",
+ (unsigned long long)netns_a_id);
+
+ /* Fork second child - creates user namespace B */
+ ASSERT_EQ(pipe(pipefd2), 0);
+
+ pid2 = fork();
+ ASSERT_GE(pid2, 0);
+
+ if (pid2 == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_sibling_netns;
+
+ close(pipefd[0]);
+ close(pipefd2[0]);
+
+ /* Create user namespace B (sibling to A) */
+ if (setup_userns() < 0) {
+ close(pipefd2[1]);
+ exit(1);
+ }
+
+ /* Try to list all network namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ found_sibling_netns = false;
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_a_id) {
+ found_sibling_netns = true;
+ break;
+ }
+ }
+ }
+
+ /* We should NOT see the sibling's network namespace */
+ write(pipefd2[1], &found_sibling_netns, sizeof(found_sibling_netns));
+ close(pipefd2[1]);
+ exit(0);
+ }
+
+ /* Parent reads result from second child */
+ close(pipefd2[1]);
+ found_sibling_netns = false;
+ read(pipefd2[0], &found_sibling_netns, sizeof(found_sibling_netns));
+ close(pipefd2[0]);
+
+ /* Signal first child to exit */
+ close(pipefd[0]);
+
+ /* Wait for both children */
+ waitpid(pid2, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+
+ waitpid(pid1, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+
+ /* Second child should NOT have seen first child's namespace */
+ ASSERT_FALSE(found_sibling_netns);
+ TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace");
+}
+
+/*
+ * Test permission checking with LISTNS_CURRENT_USER.
+ * Verify that listing with LISTNS_CURRENT_USER respects permissions.
+ */
+TEST(listns_current_user_permissions)
+{
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool success;
+ ssize_t count;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = LISTNS_CURRENT_USER,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool success;
+
+ close(pipefd[0]);
+
+ /* Create user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Create some namespaces owned by this user namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (unshare(CLONE_NEWUTS) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* List with LISTNS_CURRENT_USER - should see our owned namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ success = (ret >= 3); /* At least user, net, uts */
+ write(pipefd[1], &success, sizeof(success));
+ write(pipefd[1], &ret, sizeof(ret));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ success = false;
+ count = 0;
+ read(pipefd[0], &success, sizeof(success));
+ read(pipefd[0], &count, sizeof(count));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(success);
+ TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count);
+}
+
+/*
+ * Test that CAP_SYS_ADMIN in parent user namespace allows seeing
+ * child user namespace's owned namespaces.
+ */
+TEST(listns_parent_userns_cap_sys_admin)
+{
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool found_child_userns;
+ ssize_t count;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 parent_userns_id;
+ __u64 child_userns_id;
+ struct ns_id_req req;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_child_userns;
+
+ close(pipefd[0]);
+
+ /* Create parent user namespace - we have CAP_SYS_ADMIN in it */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get parent user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create child user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get child user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create namespaces owned by child user namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* List namespaces owned by parent user namespace */
+ req.size = sizeof(req);
+ req.spare = 0;
+ req.ns_id = 0;
+ req.ns_type = 0;
+ req.spare2 = 0;
+ req.user_ns_id = parent_userns_id;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ /* Should see child user namespace in the list */
+ found_child_userns = false;
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == child_userns_id) {
+ found_child_userns = true;
+ break;
+ }
+ }
+ }
+
+ write(pipefd[1], &found_child_userns, sizeof(found_child_userns));
+ write(pipefd[1], &ret, sizeof(ret));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ found_child_userns = false;
+ count = 0;
+ read(pipefd[0], &found_child_userns, sizeof(found_child_userns));
+ read(pipefd[0], &count, sizeof(count));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(found_child_userns);
+ TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)",
+ count);
+}
+
+/*
+ * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of.
+ * This is different from seeing namespaces owned by a user namespace.
+ */
+TEST(listns_cap_sys_admin_inside_userns)
+{
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool found_ours;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 our_userns_id;
+ struct ns_id_req req;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_ours;
+
+ close(pipefd[0]);
+
+ /* Create user namespace - we have CAP_SYS_ADMIN inside it */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get our user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* List all user namespaces globally */
+ req.size = sizeof(req);
+ req.spare = 0;
+ req.ns_id = 0;
+ req.ns_type = CLONE_NEWUSER;
+ req.spare2 = 0;
+ req.user_ns_id = 0;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ /* We should be able to see our own user namespace */
+ found_ours = false;
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == our_userns_id) {
+ found_ours = true;
+ break;
+ }
+ }
+ }
+
+ write(pipefd[1], &found_ours, sizeof(found_ours));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ found_ours = false;
+ read(pipefd[0], &found_ours, sizeof(found_ours));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(found_ours);
+ TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of");
+}
+
+/*
+ * Test that dropping CAP_SYS_ADMIN restricts what we can see.
+ */
+TEST(listns_drop_cap_sys_admin)
+{
+ cap_t caps;
+ cap_value_t cap_list[1] = { CAP_SYS_ADMIN };
+
+ /* This test needs to start with CAP_SYS_ADMIN */
+ caps = cap_get_proc();
+ if (!caps) {
+ SKIP(return, "Cannot get capabilities");
+ }
+
+ cap_flag_value_t cap_val;
+ if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) {
+ cap_free(caps);
+ SKIP(return, "Cannot check CAP_SYS_ADMIN");
+ }
+
+ if (cap_val != CAP_SET) {
+ cap_free(caps);
+ SKIP(return, "Test needs CAP_SYS_ADMIN to start");
+ }
+ cap_free(caps);
+
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool correct;
+ ssize_t count_before, count_after;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = LISTNS_CURRENT_USER,
+ };
+ __u64 ns_ids_before[100];
+ ssize_t count_before;
+ __u64 ns_ids_after[100];
+ ssize_t count_after;
+ bool correct;
+
+ close(pipefd[0]);
+
+ /* Create user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Count namespaces with CAP_SYS_ADMIN */
+ count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+
+ /* Drop CAP_SYS_ADMIN */
+ caps = cap_get_proc();
+ if (caps) {
+ cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR);
+ cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR);
+ cap_set_proc(caps);
+ cap_free(caps);
+ }
+
+ /* Ensure we can't regain the capability */
+ prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+
+ /* Count namespaces without CAP_SYS_ADMIN */
+ count_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+
+ /* Without CAP_SYS_ADMIN, we should see same or fewer namespaces */
+ correct = (count_after <= count_before);
+
+ write(pipefd[1], &correct, sizeof(correct));
+ write(pipefd[1], &count_before, sizeof(count_before));
+ write(pipefd[1], &count_after, sizeof(count_after));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ correct = false;
+ count_before = 0;
+ count_after = 0;
+ read(pipefd[0], &correct, sizeof(correct));
+ read(pipefd[0], &count_before, sizeof(count_before));
+ read(pipefd[0], &count_after, sizeof(count_after));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(correct);
+ TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces",
+ count_before, count_after);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_test.c b/tools/testing/selftests/namespaces/listns_test.c
new file mode 100644
index 000000000000..8a95789d6a87
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_test.c
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test basic listns() functionality with the unified namespace tree.
+ * List all active namespaces globally.
+ */
+TEST(listns_basic_unified)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0, /* All types */
+ .spare2 = 0,
+ .user_ns_id = 0, /* Global listing */
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+
+ /* Should find at least the initial namespaces */
+ ASSERT_GT(ret, 0);
+ TH_LOG("Found %zd active namespaces", ret);
+
+ /* Verify all returned IDs are non-zero */
+ for (ssize_t i = 0; i < ret; i++) {
+ ASSERT_NE(ns_ids[i], 0);
+ TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+ }
+}
+
+/*
+ * Test listns() with type filtering.
+ * List only network namespaces.
+ */
+TEST(listns_filter_by_type)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET, /* Only network namespaces */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret, 0);
+
+ /* Should find at least init_net */
+ ASSERT_GT(ret, 0);
+ TH_LOG("Found %zd active network namespaces", ret);
+
+ /* Verify we can open each namespace and it's actually a network namespace */
+ for (ssize_t i = 0; i < ret && i < 5; i++) {
+ struct nsfs_file_handle nsfh = {
+ .ns_id = ns_ids[i],
+ .ns_type = CLONE_NEWNET,
+ .ns_inum = 0,
+ };
+ struct file_handle *fh;
+ int fd;
+
+ fh = (struct file_handle *)malloc(sizeof(*fh) + sizeof(nsfh));
+ ASSERT_NE(fh, NULL);
+ fh->handle_bytes = sizeof(nsfh);
+ fh->handle_type = 0;
+ memcpy(fh->f_handle, &nsfh, sizeof(nsfh));
+
+ fd = open_by_handle_at(-10003, fh, O_RDONLY);
+ free(fh);
+
+ if (fd >= 0) {
+ int ns_type;
+ /* Verify it's a network namespace via ioctl */
+ ns_type = ioctl(fd, NS_GET_NSTYPE);
+ if (ns_type >= 0) {
+ ASSERT_EQ(ns_type, CLONE_NEWNET);
+ }
+ close(fd);
+ }
+ }
+}
+
+/*
+ * Test listns() pagination.
+ * List namespaces in batches.
+ */
+TEST(listns_pagination)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 batch1[2], batch2[2];
+ ssize_t ret1, ret2;
+
+ /* Get first batch */
+ ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0);
+ if (ret1 < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret1, 0);
+
+ if (ret1 == 0)
+ SKIP(return, "No namespaces found");
+
+ TH_LOG("First batch: %zd namespaces", ret1);
+
+ /* Get second batch using last ID from first batch */
+ if (ret1 == ARRAY_SIZE(batch1)) {
+ req.ns_id = batch1[ret1 - 1];
+ ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0);
+ ASSERT_GE(ret2, 0);
+
+ TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)",
+ ret2, (unsigned long long)req.ns_id);
+
+ /* If we got more results, verify IDs are monotonically increasing */
+ if (ret2 > 0) {
+ ASSERT_GT(batch2[0], batch1[ret1 - 1]);
+ TH_LOG("Pagination working: %llu > %llu",
+ (unsigned long long)batch2[0],
+ (unsigned long long)batch1[ret1 - 1]);
+ }
+ } else {
+ TH_LOG("All namespaces fit in first batch");
+ }
+}
+
+/*
+ * Test listns() with LISTNS_CURRENT_USER.
+ * List namespaces owned by current user namespace.
+ */
+TEST(listns_current_user)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = LISTNS_CURRENT_USER,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret, 0);
+
+ /* Should find at least the initial namespaces if we're in init_user_ns */
+ TH_LOG("Found %zd namespaces owned by current user namespace", ret);
+
+ for (ssize_t i = 0; i < ret; i++)
+ TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+}
+
+/*
+ * Test that listns() only returns active namespaces.
+ * Create a namespace, let it become inactive, verify it's not listed.
+ */
+TEST(listns_only_active)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[100], ns_ids_after[100];
+ ssize_t ret_before, ret_after;
+ int pipefd[2];
+ pid_t pid;
+ __u64 new_ns_id = 0;
+ int status;
+
+ /* Get initial list */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret_before, 0);
+
+ TH_LOG("Before: %zd active network namespaces", ret_before);
+
+ /* Create a new namespace in a child process and get its ID */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 ns_id;
+
+ close(pipefd[0]);
+
+ /* Create new network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get its ID */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send ID to parent */
+ write(pipefd[1], &ns_id, sizeof(ns_id));
+ close(pipefd[1]);
+
+ /* Keep namespace active briefly */
+ usleep(100000);
+ exit(0);
+ }
+
+ /* Parent reads the new namespace ID */
+ {
+ int bytes;
+
+ close(pipefd[1]);
+ bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id));
+ close(pipefd[0]);
+
+ if (bytes == sizeof(new_ns_id)) {
+ __u64 ns_ids_during[100];
+ int ret_during;
+
+ TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id);
+
+ /* List namespaces while child is still alive - should see new one */
+ ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0);
+ ASSERT_GE(ret_during, 0);
+ TH_LOG("During: %d active network namespaces", ret_during);
+
+ /* Should have more namespaces than before */
+ ASSERT_GE(ret_during, ret_before);
+ }
+ }
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+
+ /* Give time for namespace to become inactive */
+ usleep(100000);
+
+ /* List namespaces after child exits - should not see new one */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+ TH_LOG("After: %zd active network namespaces", ret_after);
+
+ /* Verify the new namespace ID is not in the after list */
+ if (new_ns_id != 0) {
+ bool found = false;
+
+ for (ssize_t i = 0; i < ret_after; i++) {
+ if (ns_ids_after[i] == new_ns_id) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_FALSE(found);
+ }
+}
+
+/*
+ * Test listns() with specific user namespace ID.
+ * Create a user namespace and list namespaces it owns.
+ */
+TEST(listns_specific_userns)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0, /* Will be filled with created userns ID */
+ };
+ __u64 ns_ids[100];
+ int sv[2];
+ pid_t pid;
+ int status;
+ __u64 user_ns_id = 0;
+ int bytes;
+ ssize_t ret;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 ns_id;
+ char buf;
+
+ close(sv[0]);
+
+ /* Create new user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send ID to parent */
+ if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Create some namespaces owned by this user namespace */
+ unshare(CLONE_NEWNET);
+ unshare(CLONE_NEWUTS);
+
+ /* Wait for parent signal */
+ if (read(sv[1], &buf, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+ close(sv[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(sv[1]);
+ bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id));
+
+ if (bytes != sizeof(user_ns_id)) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get user namespace ID from child");
+ }
+
+ TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id);
+
+ /* List namespaces owned by this user namespace */
+ req.user_ns_id = user_ns_id;
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ if (ret < 0) {
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOSYS) {
+ SKIP(return, "listns() not supported");
+ }
+ ASSERT_GE(ret, 0);
+ }
+
+ TH_LOG("Found %zd namespaces owned by user namespace %llu", ret,
+ (unsigned long long)user_ns_id);
+
+ /* Should find at least the network and UTS namespaces we created */
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret && i < 10; i++)
+ TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+ }
+
+ /* Signal child to exit */
+ if (write(sv[0], "X", 1) != 1) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ close(sv[0]);
+ waitpid(pid, &status, 0);
+}
+
+/*
+ * Test listns() with multiple namespace types filter.
+ */
+TEST(listns_multiple_types)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET | CLONE_NEWUTS, /* Network and UTS */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[100];
+ ssize_t ret;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_GE(ret, 0);
+
+ TH_LOG("Found %zd active network/UTS namespaces", ret);
+
+ for (ssize_t i = 0; i < ret; i++)
+ TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+}
+
+/*
+ * Test that hierarchical active reference propagation keeps parent
+ * user namespaces visible in listns().
+ */
+TEST(listns_hierarchical_visibility)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 parent_ns_id = 0, child_ns_id = 0;
+ int sv[2];
+ pid_t pid;
+ int status;
+ int bytes;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_parent, found_child;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ char buf;
+
+ close(sv[0]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create child user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send both IDs to parent */
+ if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+ if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal */
+ if (read(sv[1], &buf, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+ close(sv[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(sv[1]);
+
+ /* Read both namespace IDs */
+ bytes = read(sv[0], &parent_ns_id, sizeof(parent_ns_id));
+ bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id));
+
+ if (bytes != (int)(2 * sizeof(__u64))) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace IDs from child");
+ }
+
+ TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id);
+ TH_LOG("Child user namespace ID: %llu", (unsigned long long)child_ns_id);
+
+ /* List all user namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ if (ret < 0 && errno == ENOSYS) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "listns() not supported");
+ }
+
+ ASSERT_GE(ret, 0);
+ TH_LOG("Found %zd active user namespaces", ret);
+
+ /* Both parent and child should be visible (active due to child process) */
+ found_parent = false;
+ found_child = false;
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_ns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_ns_id)
+ found_child = true;
+ }
+
+ TH_LOG("Parent namespace %s, child namespace %s",
+ found_parent ? "found" : "NOT FOUND",
+ found_child ? "found" : "NOT FOUND");
+
+ ASSERT_TRUE(found_child);
+ /* With hierarchical propagation, parent should also be active */
+ ASSERT_TRUE(found_parent);
+
+ /* Signal child to exit */
+ if (write(sv[0], "X", 1) != 1) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ close(sv[0]);
+ waitpid(pid, &status, 0);
+}
+
+/*
+ * Test error cases for listns().
+ */
+TEST(listns_error_cases)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[10];
+ int ret;
+
+ /* Test with invalid flags */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF);
+ if (errno == ENOSYS) {
+ /* listns() not supported, skip this check */
+ } else {
+ ASSERT_LT(ret, 0);
+ ASSERT_EQ(errno, EINVAL);
+ }
+
+ /* Test with NULL ns_ids array */
+ ret = sys_listns(&req, NULL, 10, 0);
+ ASSERT_LT(ret, 0);
+
+ /* Test with invalid spare field */
+ req.spare = 1;
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (errno == ENOSYS) {
+ /* listns() not supported, skip this check */
+ } else {
+ ASSERT_LT(ret, 0);
+ ASSERT_EQ(errno, EINVAL);
+ }
+ req.spare = 0;
+
+ /* Test with huge nr_ns_ids */
+ ret = sys_listns(&req, ns_ids, 2000000, 0);
+ if (errno == ENOSYS) {
+ /* listns() not supported, skip this check */
+ } else {
+ ASSERT_LT(ret, 0);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/ns_active_ref_test.c b/tools/testing/selftests/namespaces/ns_active_ref_test.c
new file mode 100644
index 000000000000..093268f0efaa
--- /dev/null
+++ b/tools/testing/selftests/namespaces/ns_active_ref_test.c
@@ -0,0 +1,2672 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <pthread.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
+#endif
+
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
+
+/*
+ * Test that initial namespaces can be reopened via file handle.
+ * Initial namespaces should have active ref count of 1 from boot.
+ */
+TEST(init_ns_always_active)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd1, fd2;
+ struct stat st1, st2;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open initial network namespace */
+ fd1 = open("/proc/1/ns/net", O_RDONLY);
+ ASSERT_GE(fd1, 0);
+
+ /* Get file handle for initial namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd1, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(fd1);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+
+ /* Close the namespace fd */
+ close(fd1);
+
+ /* Try to reopen via file handle - should succeed since init ns is always active */
+ fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd2 < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle);
+ return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd2, 0);
+
+ /* Verify we opened the same namespace */
+ fd1 = open("/proc/1/ns/net", O_RDONLY);
+ ASSERT_GE(fd1, 0);
+ ASSERT_EQ(fstat(fd1, &st1), 0);
+ ASSERT_EQ(fstat(fd2, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ close(fd1);
+ close(fd2);
+ free(handle);
+}
+
+/*
+ * Test namespace lifecycle: create a namespace in a child process,
+ * get a file handle while it's active, then try to reopen after
+ * the process exits (namespace becomes inactive).
+ */
+TEST(ns_inactive_after_exit)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ /* Create pipe for passing file handle from child */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Open our new namespace */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get file handle for the namespace */
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+
+ /* Exit - namespace should become inactive */
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ /* Read file handle from child */
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /* Try to reopen namespace - should fail with ENOENT since it's inactive */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ /* Should fail with ENOENT (namespace inactive) or ESTALE */
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that a namespace remains active while a process is using it,
+ * even after the creating process exits.
+ */
+TEST(ns_active_with_multiple_processes)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ int syncpipe[2];
+ pid_t pid1, pid2;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+
+ /* Create pipes for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ pid1 = fork();
+ ASSERT_GE(pid1, 0);
+
+ if (pid1 == 0) {
+ /* First child - creates namespace */
+ close(pipefd[0]);
+ close(syncpipe[1]);
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ /* Open and get handle */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+
+ /* Wait for signal before exiting */
+ read(syncpipe[0], &sync_byte, 1);
+ close(syncpipe[0]);
+ exit(0);
+ }
+
+ /* Parent reads handle */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+ ASSERT_GT(ret, 0);
+
+ handle = (struct file_handle *)buf;
+
+ /* Create second child that will keep namespace active */
+ pid2 = fork();
+ ASSERT_GE(pid2, 0);
+
+ if (pid2 == 0) {
+ /* Second child - reopens the namespace */
+ close(syncpipe[0]);
+ close(syncpipe[1]);
+
+ /* Open the namespace via handle */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0) {
+ exit(1);
+ }
+
+ /* Join the namespace */
+ ret = setns(fd, CLONE_NEWNET);
+ close(fd);
+ if (ret < 0) {
+ exit(1);
+ }
+
+ /* Sleep to keep namespace active */
+ sleep(1);
+ exit(0);
+ }
+
+ /* Let second child enter the namespace */
+ usleep(100000); /* 100ms */
+
+ /* Signal first child to exit */
+ close(syncpipe[0]);
+ sync_byte = 'X';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ /* Wait for first child */
+ waitpid(pid1, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+
+ /* Namespace should still be active because second child is using it */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(fd, 0);
+ close(fd);
+
+ /* Wait for second child */
+ waitpid(pid2, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+}
+
+/*
+ * Test user namespace active ref tracking via credential lifecycle
+ */
+TEST(userns_active_ref_lifecycle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Set up uid/gid mappings */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) {
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+ }
+
+ /* Get file handle */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /* Namespace should be inactive after all tasks exit */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test PID namespace active ref tracking
+ */
+TEST(pidns_active_ref_lifecycle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new PID namespace */
+ ret = unshare(CLONE_NEWPID);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Fork to actually enter the PID namespace */
+ pid_t child = fork();
+ if (child < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (child == 0) {
+ /* Grandchild - in new PID namespace */
+ fd = open("/proc/self/ns/pid", O_RDONLY);
+ if (fd < 0) {
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ exit(1);
+ }
+
+ /* Send handle to grandparent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Wait for grandchild */
+ waitpid(child, NULL, 0);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /* Namespace should be inactive after all processes exit */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that an open file descriptor keeps a namespace active.
+ * Even after the creating process exits, the namespace should remain
+ * active as long as an fd is held open.
+ */
+TEST(ns_fd_keeps_active)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int nsfd;
+ int pipe_child_ready[2];
+ int pipe_parent_ready[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+ char proc_path[64];
+
+ ASSERT_EQ(pipe(pipe_child_ready), 0);
+ ASSERT_EQ(pipe(pipe_parent_ready), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipe_child_ready[0]);
+ close(pipe_parent_ready[1]);
+
+ TH_LOG("Child: creating new network namespace");
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+ close(pipe_child_ready[1]);
+ close(pipe_parent_ready[0]);
+ exit(1);
+ }
+
+ TH_LOG("Child: network namespace created successfully");
+
+ /* Get file handle for the namespace */
+ nsfd = open("/proc/self/ns/net", O_RDONLY);
+ if (nsfd < 0) {
+ TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno));
+ close(pipe_child_ready[1]);
+ close(pipe_parent_ready[0]);
+ exit(1);
+ }
+
+ TH_LOG("Child: opened namespace fd %d", nsfd);
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(nsfd);
+
+ if (ret < 0) {
+ TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno));
+ close(pipe_child_ready[1]);
+ close(pipe_parent_ready[0]);
+ exit(1);
+ }
+
+ TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes);
+
+ /* Send file handle to parent */
+ ret = write(pipe_child_ready[1], buf, sizeof(*handle) + handle->handle_bytes);
+ TH_LOG("Child: sent %d bytes of file handle to parent", ret);
+ close(pipe_child_ready[1]);
+
+ /* Wait for parent to open the fd */
+ TH_LOG("Child: waiting for parent to open fd");
+ ret = read(pipe_parent_ready[0], &sync_byte, 1);
+ close(pipe_parent_ready[0]);
+
+ TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret);
+ /* Exit - namespace should stay active because parent holds fd */
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipe_child_ready[1]);
+ close(pipe_parent_ready[0]);
+
+ TH_LOG("Parent: reading file handle from child");
+
+ /* Read file handle from child */
+ ret = read(pipe_child_ready[0], buf, sizeof(buf));
+ close(pipe_child_ready[0]);
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes);
+
+ /* Open the child's namespace while it's still alive */
+ snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid);
+ TH_LOG("Parent: opening child's namespace at %s", proc_path);
+ nsfd = open(proc_path, O_RDONLY);
+ if (nsfd < 0) {
+ TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno));
+ close(pipe_parent_ready[1]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child's namespace");
+ }
+
+ TH_LOG("Parent: opened child's namespace, got fd %d", nsfd);
+
+ /* Signal child that we have the fd */
+ sync_byte = 'G';
+ write(pipe_parent_ready[1], &sync_byte, 1);
+ close(pipe_parent_ready[1]);
+ TH_LOG("Parent: signaled child that we have the fd");
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ TH_LOG("Child exited, parent holds fd %d to namespace", nsfd);
+
+ /*
+ * Namespace should still be ACTIVE because we hold an fd.
+ * We should be able to reopen it via file handle.
+ */
+ TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)");
+ int fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(fd2, 0);
+
+ TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2);
+
+ /* Verify it's the same namespace */
+ struct stat st1, st2;
+ ASSERT_EQ(fstat(nsfd, &st1), 0);
+ ASSERT_EQ(fstat(fd2, &st2), 0);
+ TH_LOG("Namespace inodes: nsfd=%lu, fd2=%lu", st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ close(fd2);
+
+ /* Now close the fd - namespace should become inactive */
+ TH_LOG("Closing fd %d - namespace should become inactive", nsfd);
+ close(nsfd);
+
+ /* Now reopening should fail - namespace is inactive */
+ TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)");
+ fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd2, 0);
+ /* Should fail with ENOENT (inactive) or ESTALE (gone) */
+ TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test hierarchical active reference propagation.
+ * When a child namespace is active, its owning user namespace should also
+ * be active automatically due to hierarchical active reference propagation.
+ * This ensures parents are always reachable when children are active.
+ */
+TEST(ns_parent_always_reachable)
+{
+ struct file_handle *parent_handle, *child_handle;
+ int ret;
+ int child_nsfd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 parent_id, child_id;
+ char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+ char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ TH_LOG("Child: creating parent user namespace and setting up mappings");
+
+ /* Create parent user namespace with mappings */
+ ret = setup_userns();
+ if (ret < 0) {
+ TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno));
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid());
+
+ /* Get namespace ID for parent user namespace */
+ int parent_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (parent_fd < 0) {
+ TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno));
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ TH_LOG("Child: opened parent userns fd %d", parent_fd);
+
+ if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) {
+ TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno));
+ close(parent_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(parent_fd);
+
+ TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id);
+
+ /* Create child user namespace within parent */
+ TH_LOG("Child: creating nested child user namespace");
+ ret = setup_userns();
+ if (ret < 0) {
+ TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno));
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", getuid(), getgid());
+
+ /* Get namespace ID for child user namespace */
+ int child_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (child_fd < 0) {
+ TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno));
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ TH_LOG("Child: opened child userns fd %d", child_fd);
+
+ if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) {
+ TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno));
+ close(child_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(child_fd);
+
+ TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id);
+
+ /* Send both namespace IDs to parent */
+ TH_LOG("Child: sending both namespace IDs to parent");
+ write(pipefd[1], &parent_id, sizeof(parent_id));
+ write(pipefd[1], &child_id, sizeof(child_id));
+ close(pipefd[1]);
+
+ TH_LOG("Child: exiting - parent userns should become inactive");
+ /* Exit - parent user namespace should become inactive */
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ TH_LOG("Parent: reading both namespace IDs from child");
+
+ /* Read both namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &parent_id, sizeof(parent_id));
+ if (ret != sizeof(parent_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID from child");
+ }
+
+ ret = read(pipefd[0], &child_id, sizeof(child_id));
+ close(pipefd[0]);
+ if (ret != sizeof(child_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read child namespace ID from child");
+ }
+
+ TH_LOG("Parent: received parent_id=%llu, child_id=%llu",
+ (unsigned long long)parent_id, (unsigned long long)child_id);
+
+ /* Construct file handles from namespace IDs */
+ parent_handle = (struct file_handle *)parent_buf;
+ parent_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ parent_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle;
+ parent_fh->ns_id = parent_id;
+ parent_fh->ns_type = 0;
+ parent_fh->ns_inum = 0;
+
+ child_handle = (struct file_handle *)child_buf;
+ child_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ child_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *child_fh = (struct nsfs_file_handle *)child_handle->f_handle;
+ child_fh->ns_id = child_id;
+ child_fh->ns_type = 0;
+ child_fh->ns_inum = 0;
+
+ TH_LOG("Parent: opening child namespace BEFORE child exits");
+
+ /* Open child namespace while child is still alive to keep it active */
+ child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY);
+ if (child_nsfd < 0) {
+ TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(errno), errno);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child namespace");
+ }
+
+ TH_LOG("Opened child namespace fd %d", child_nsfd);
+
+ /* Now wait for child to exit */
+ TH_LOG("Parent: waiting for child to exit");
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ TH_LOG("Child process exited, parent holds fd to child namespace");
+
+ /*
+ * With hierarchical active reference propagation:
+ * Since the child namespace is active (parent process holds fd),
+ * the parent user namespace should ALSO be active automatically.
+ * This is because when we took an active reference on the child,
+ * it propagated up to the owning user namespace.
+ */
+ TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)");
+ int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_GE(parent_fd, 0);
+
+ TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd);
+
+ /* Verify we can also get parent via NS_GET_USERNS */
+ TH_LOG("Verifying NS_GET_USERNS also works");
+ int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS);
+ if (parent_fd2 < 0) {
+ close(parent_fd);
+ close(child_nsfd);
+ TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno);
+ SKIP(return, "NS_GET_USERNS not supported or failed");
+ }
+
+ TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2);
+
+ /* Verify both methods give us the same namespace */
+ struct stat st1, st2;
+ ASSERT_EQ(fstat(parent_fd, &st1), 0);
+ ASSERT_EQ(fstat(parent_fd2, &st2), 0);
+ TH_LOG("Parent namespace inodes: parent_fd=%lu, parent_fd2=%lu", st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ /*
+ * Close child fd - parent should remain active because we still
+ * hold direct references to it (parent_fd and parent_fd2).
+ */
+ TH_LOG("Closing child fd - parent should remain active (direct refs held)");
+ close(child_nsfd);
+
+ /* Parent should still be openable */
+ TH_LOG("Verifying parent still active via file handle");
+ int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_GE(parent_fd3, 0);
+ close(parent_fd3);
+
+ TH_LOG("Closing all fds to parent namespace");
+ close(parent_fd);
+ close(parent_fd2);
+
+ /* Both should now be inactive */
+ TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)");
+ parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_LT(parent_fd, 0);
+ TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that bind mounts keep namespaces in the tree even when inactive
+ */
+TEST(ns_bind_mount_keeps_in_tree)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+ char tmpfile[] = "/tmp/ns-test-XXXXXX";
+ int tmpfd;
+
+ /* Create temporary file for bind mount */
+ tmpfd = mkstemp(tmpfile);
+ if (tmpfd < 0) {
+ SKIP(return, "Cannot create temporary file");
+ }
+ close(tmpfd);
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Unshare mount namespace and make mounts private to avoid propagation */
+ ret = unshare(CLONE_NEWNS);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+ ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Bind mount the namespace */
+ ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Get file handle */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ umount(tmpfile);
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ umount(tmpfile);
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /*
+ * Namespace should be inactive but still in tree due to bind mount.
+ * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree).
+ */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ /* Should be ENOENT (inactive) since bind mount keeps it in tree */
+ if (errno != ENOENT && errno != ESTALE) {
+ TH_LOG("Unexpected error: %d", errno);
+ }
+
+ /* Cleanup */
+ umount(tmpfile);
+ unlink(tmpfile);
+}
+
+/*
+ * Test multi-level hierarchy (3+ levels deep).
+ * Grandparent → Parent → Child
+ * When child is active, both parent AND grandparent should be active.
+ */
+TEST(ns_multilevel_hierarchy)
+{
+ struct file_handle *gp_handle, *p_handle, *c_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 gp_id, p_id, c_id;
+ char gp_buf[sizeof(*gp_handle) + MAX_HANDLE_SZ];
+ char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ];
+ char c_buf[sizeof(*c_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create grandparent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int gp_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (gp_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(gp_fd, NS_GET_ID, &gp_id) < 0) {
+ close(gp_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(gp_fd);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int p_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (p_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+ close(p_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(p_fd);
+
+ /* Create child user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int c_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (c_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c_fd, NS_GET_ID, &c_id) < 0) {
+ close(c_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c_fd);
+
+ /* Send all three namespace IDs */
+ write(pipefd[1], &gp_id, sizeof(gp_id));
+ write(pipefd[1], &p_id, sizeof(p_id));
+ write(pipefd[1], &c_id, sizeof(c_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &gp_id, sizeof(gp_id));
+ if (ret != sizeof(gp_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read grandparent namespace ID from child");
+ }
+
+ ret = read(pipefd[0], &p_id, sizeof(p_id));
+ if (ret != sizeof(p_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID from child");
+ }
+
+ ret = read(pipefd[0], &c_id, sizeof(c_id));
+ close(pipefd[0]);
+ if (ret != sizeof(c_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read child namespace ID from child");
+ }
+
+ /* Construct file handles from namespace IDs */
+ gp_handle = (struct file_handle *)gp_buf;
+ gp_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ gp_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *gp_fh = (struct nsfs_file_handle *)gp_handle->f_handle;
+ gp_fh->ns_id = gp_id;
+ gp_fh->ns_type = 0;
+ gp_fh->ns_inum = 0;
+
+ p_handle = (struct file_handle *)p_buf;
+ p_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ p_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle;
+ p_fh->ns_id = p_id;
+ p_fh->ns_type = 0;
+ p_fh->ns_inum = 0;
+
+ c_handle = (struct file_handle *)c_buf;
+ c_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c_fh = (struct nsfs_file_handle *)c_handle->f_handle;
+ c_fh->ns_id = c_id;
+ c_fh->ns_type = 0;
+ c_fh->ns_inum = 0;
+
+ /* Open child before process exits */
+ int c_fd = open_by_handle_at(FD_NSFS_ROOT, c_handle, O_RDONLY);
+ if (c_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child namespace");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /*
+ * With 3-level hierarchy and child active:
+ * - Child is active (we hold fd)
+ * - Parent should be active (propagated from child)
+ * - Grandparent should be active (propagated from parent)
+ */
+ TH_LOG("Testing parent active when child is active");
+ int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+
+ TH_LOG("Testing grandparent active when child is active");
+ int gp_fd = open_by_handle_at(FD_NSFS_ROOT, gp_handle, O_RDONLY);
+ ASSERT_GE(gp_fd, 0);
+
+ close(c_fd);
+ close(p_fd);
+ close(gp_fd);
+}
+
+/*
+ * Test multiple children sharing same parent.
+ * Parent should stay active as long as ANY child is active.
+ */
+TEST(ns_multiple_children_same_parent)
+{
+ struct file_handle *p_handle, *c1_handle, *c2_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 p_id, c1_id, c2_id;
+ char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ];
+ char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ];
+ char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int p_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (p_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+ close(p_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(p_fd);
+
+ /* Create first child user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int c1_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (c1_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) {
+ close(c1_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c1_fd);
+
+ /* Return to parent user namespace and create second child */
+ /* We can't actually do this easily, so let's create a sibling namespace
+ * by creating a network namespace instead */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int c2_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (c2_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) {
+ close(c2_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c2_fd);
+
+ /* Send all namespace IDs */
+ write(pipefd[1], &p_id, sizeof(p_id));
+ write(pipefd[1], &c1_id, sizeof(c1_id));
+ write(pipefd[1], &c2_id, sizeof(c2_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &p_id, sizeof(p_id));
+ if (ret != sizeof(p_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID");
+ }
+
+ ret = read(pipefd[0], &c1_id, sizeof(c1_id));
+ if (ret != sizeof(c1_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read first child namespace ID");
+ }
+
+ ret = read(pipefd[0], &c2_id, sizeof(c2_id));
+ close(pipefd[0]);
+ if (ret != sizeof(c2_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read second child namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ p_handle = (struct file_handle *)p_buf;
+ p_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ p_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle;
+ p_fh->ns_id = p_id;
+ p_fh->ns_type = 0;
+ p_fh->ns_inum = 0;
+
+ c1_handle = (struct file_handle *)c1_buf;
+ c1_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c1_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle;
+ c1_fh->ns_id = c1_id;
+ c1_fh->ns_type = 0;
+ c1_fh->ns_inum = 0;
+
+ c2_handle = (struct file_handle *)c2_buf;
+ c2_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c2_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle;
+ c2_fh->ns_id = c2_id;
+ c2_fh->ns_type = 0;
+ c2_fh->ns_inum = 0;
+
+ /* Open both children before process exits */
+ int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY);
+ int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY);
+
+ if (c1_fd < 0 || c2_fd < 0) {
+ if (c1_fd >= 0) close(c1_fd);
+ if (c2_fd >= 0) close(c2_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child namespaces");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Parent should be active (both children active) */
+ TH_LOG("Both children active - parent should be active");
+ int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close first child - parent should STILL be active */
+ TH_LOG("Closing first child - parent should still be active");
+ close(c1_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close second child - NOW parent should become inactive */
+ TH_LOG("Closing second child - parent should become inactive");
+ close(c2_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_LT(p_fd, 0);
+}
+
+/*
+ * Test that different namespace types with same owner all contribute
+ * active references to the owning user namespace.
+ */
+TEST(ns_different_types_same_owner)
+{
+ struct file_handle *u_handle, *n_handle, *ut_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 u_id, n_id, ut_id;
+ char u_buf[sizeof(*u_handle) + MAX_HANDLE_SZ];
+ char n_buf[sizeof(*n_handle) + MAX_HANDLE_SZ];
+ char ut_buf[sizeof(*ut_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int u_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (u_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) {
+ close(u_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(u_fd);
+
+ /* Create network namespace (owned by user namespace) */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int n_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) {
+ close(n_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(n_fd);
+
+ /* Create UTS namespace (also owned by user namespace) */
+ if (unshare(CLONE_NEWUTS) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ut_fd = open("/proc/self/ns/uts", O_RDONLY);
+ if (ut_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) {
+ close(ut_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ut_fd);
+
+ /* Send all namespace IDs */
+ write(pipefd[1], &u_id, sizeof(u_id));
+ write(pipefd[1], &n_id, sizeof(n_id));
+ write(pipefd[1], &ut_id, sizeof(ut_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &u_id, sizeof(u_id));
+ if (ret != sizeof(u_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID");
+ }
+
+ ret = read(pipefd[0], &n_id, sizeof(n_id));
+ if (ret != sizeof(n_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ ret = read(pipefd[0], &ut_id, sizeof(ut_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ut_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read UTS namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ u_handle = (struct file_handle *)u_buf;
+ u_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ u_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)u_handle->f_handle;
+ u_fh->ns_id = u_id;
+ u_fh->ns_type = 0;
+ u_fh->ns_inum = 0;
+
+ n_handle = (struct file_handle *)n_buf;
+ n_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ n_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)n_handle->f_handle;
+ n_fh->ns_id = n_id;
+ n_fh->ns_type = 0;
+ n_fh->ns_inum = 0;
+
+ ut_handle = (struct file_handle *)ut_buf;
+ ut_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ut_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)ut_handle->f_handle;
+ ut_fh->ns_id = ut_id;
+ ut_fh->ns_type = 0;
+ ut_fh->ns_inum = 0;
+
+ /* Open both non-user namespaces before process exits */
+ int n_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY);
+ int ut_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY);
+
+ if (n_fd < 0 || ut_fd < 0) {
+ if (n_fd >= 0) close(n_fd);
+ if (ut_fd >= 0) close(ut_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open namespaces");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /*
+ * Both network and UTS namespaces are active.
+ * User namespace should be active (gets 2 active refs).
+ */
+ TH_LOG("Both net and uts active - user namespace should be active");
+ int u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close network namespace - user namespace should STILL be active */
+ TH_LOG("Closing network ns - user ns should still be active (uts still active)");
+ close(n_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close UTS namespace - user namespace should become inactive */
+ TH_LOG("Closing uts ns - user ns should become inactive");
+ close(ut_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+ ASSERT_LT(u_fd, 0);
+}
+
+/*
+ * Test hierarchical propagation with deep namespace hierarchy.
+ * Create: init_user_ns -> user_A -> user_B -> net_ns
+ * When net_ns is active, both user_A and user_B should be active.
+ * This verifies the conditional recursion in __ns_ref_active_put() works.
+ */
+TEST(ns_deep_hierarchy_propagation)
+{
+ struct file_handle *ua_handle, *ub_handle, *net_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 ua_id, ub_id, net_id;
+ char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ];
+ char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ];
+ char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create user_A -> user_B -> net hierarchy */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ua_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ua_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) {
+ close(ua_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ua_fd);
+
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ub_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ub_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) {
+ close(ub_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ub_fd);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int net_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (net_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) {
+ close(net_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(net_fd);
+
+ /* Send all three namespace IDs */
+ write(pipefd[1], &ua_id, sizeof(ua_id));
+ write(pipefd[1], &ub_id, sizeof(ub_id));
+ write(pipefd[1], &net_id, sizeof(net_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &ua_id, sizeof(ua_id));
+ if (ret != sizeof(ua_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_A namespace ID");
+ }
+
+ ret = read(pipefd[0], &ub_id, sizeof(ub_id));
+ if (ret != sizeof(ub_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_B namespace ID");
+ }
+
+ ret = read(pipefd[0], &net_id, sizeof(net_id));
+ close(pipefd[0]);
+ if (ret != sizeof(net_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ ua_handle = (struct file_handle *)ua_buf;
+ ua_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ua_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle;
+ ua_fh->ns_id = ua_id;
+ ua_fh->ns_type = 0;
+ ua_fh->ns_inum = 0;
+
+ ub_handle = (struct file_handle *)ub_buf;
+ ub_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ub_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle;
+ ub_fh->ns_id = ub_id;
+ ub_fh->ns_type = 0;
+ ub_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)net_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ net_fh->ns_id = net_id;
+ net_fh->ns_type = 0;
+ net_fh->ns_inum = 0;
+
+ /* Open net_ns before child exits to keep it active */
+ int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ if (net_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open network namespace");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* With net_ns active, both user_A and user_B should be active */
+ TH_LOG("Testing user_B active (net_ns active causes propagation)");
+ int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ ASSERT_GE(ub_fd, 0);
+
+ TH_LOG("Testing user_A active (propagated through user_B)");
+ int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd, 0);
+
+ /* Close net_ns - user_B should stay active (we hold direct ref) */
+ TH_LOG("Closing net_ns, user_B should remain active (direct ref held)");
+ close(net_fd);
+ int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ ASSERT_GE(ub_fd2, 0);
+ close(ub_fd2);
+
+ /* Close user_B - user_A should stay active (we hold direct ref) */
+ TH_LOG("Closing user_B, user_A should remain active (direct ref held)");
+ close(ub_fd);
+ int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd2, 0);
+ close(ua_fd2);
+
+ /* Close user_A - everything should become inactive */
+ TH_LOG("Closing user_A, all should become inactive");
+ close(ua_fd);
+
+ /* All should now be inactive */
+ ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_LT(ua_fd, 0);
+}
+
+/*
+ * Test that parent stays active as long as ANY child is active.
+ * Create parent user namespace with two child net namespaces.
+ * Parent should remain active until BOTH children are inactive.
+ */
+TEST(ns_parent_multiple_children_refcount)
+{
+ struct file_handle *parent_handle, *net1_handle, *net2_handle;
+ int ret, pipefd[2], syncpipe[2];
+ pid_t pid;
+ int status;
+ __u64 p_id, n1_id, n2_id;
+ char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+ char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ];
+ char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(pipe(syncpipe), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+ close(syncpipe[1]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int p_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (p_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+ close(p_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(p_fd);
+
+ /* Create first network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ int n1_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n1_fd < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+ if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) {
+ close(n1_fd);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+ /* Keep n1_fd open so first namespace stays active */
+
+ /* Create second network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(n1_fd);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ int n2_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n2_fd < 0) {
+ close(n1_fd);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+ if (ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) {
+ close(n1_fd);
+ close(n2_fd);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+ /* Keep both n1_fd and n2_fd open */
+
+ /* Send all namespace IDs */
+ write(pipefd[1], &p_id, sizeof(p_id));
+ write(pipefd[1], &n1_id, sizeof(n1_id));
+ write(pipefd[1], &n2_id, sizeof(n2_id));
+ close(pipefd[1]);
+
+ /* Wait for parent to signal before exiting */
+ read(syncpipe[0], &sync_byte, 1);
+ close(syncpipe[0]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+ close(syncpipe[0]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &p_id, sizeof(p_id));
+ if (ret != sizeof(p_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID");
+ }
+
+ ret = read(pipefd[0], &n1_id, sizeof(n1_id));
+ if (ret != sizeof(n1_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read first network namespace ID");
+ }
+
+ ret = read(pipefd[0], &n2_id, sizeof(n2_id));
+ close(pipefd[0]);
+ if (ret != sizeof(n2_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read second network namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ parent_handle = (struct file_handle *)p_buf;
+ parent_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ parent_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle;
+ p_fh->ns_id = p_id;
+ p_fh->ns_type = 0;
+ p_fh->ns_inum = 0;
+
+ net1_handle = (struct file_handle *)n1_buf;
+ net1_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net1_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n1_fh = (struct nsfs_file_handle *)net1_handle->f_handle;
+ n1_fh->ns_id = n1_id;
+ n1_fh->ns_type = 0;
+ n1_fh->ns_inum = 0;
+
+ net2_handle = (struct file_handle *)n2_buf;
+ net2_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net2_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle;
+ n2_fh->ns_id = n2_id;
+ n2_fh->ns_type = 0;
+ n2_fh->ns_inum = 0;
+
+ /* Open both net namespaces while child is still alive */
+ int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY);
+ int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY);
+ if (n1_fd < 0 || n2_fd < 0) {
+ if (n1_fd >= 0) close(n1_fd);
+ if (n2_fd >= 0) close(n2_fd);
+ sync_byte = 'G';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open net namespaces");
+ }
+
+ /* Signal child that we have opened the namespaces */
+ sync_byte = 'G';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Parent should be active (has 2 active children) */
+ TH_LOG("Both net namespaces active - parent should be active");
+ int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close first net namespace - parent should STILL be active */
+ TH_LOG("Closing first net ns - parent should still be active");
+ close(n1_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close second net namespace - parent should become inactive */
+ TH_LOG("Closing second net ns - parent should become inactive");
+ close(n2_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+ ASSERT_LT(p_fd, 0);
+}
+
+/*
+ * Test that user namespace as a child also propagates correctly.
+ * Create user_A -> user_B, verify when user_B is active that user_A
+ * is also active. This is different from non-user namespace children.
+ */
+TEST(ns_userns_child_propagation)
+{
+ struct file_handle *ua_handle, *ub_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 ua_id, ub_id;
+ char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ];
+ char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create user_A */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ua_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ua_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) {
+ close(ua_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ua_fd);
+
+ /* Create user_B (child of user_A) */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ub_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ub_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) {
+ close(ub_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ub_fd);
+
+ /* Send both namespace IDs */
+ write(pipefd[1], &ua_id, sizeof(ua_id));
+ write(pipefd[1], &ub_id, sizeof(ub_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read both namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &ua_id, sizeof(ua_id));
+ if (ret != sizeof(ua_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_A namespace ID");
+ }
+
+ ret = read(pipefd[0], &ub_id, sizeof(ub_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ub_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_B namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ ua_handle = (struct file_handle *)ua_buf;
+ ua_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ua_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle;
+ ua_fh->ns_id = ua_id;
+ ua_fh->ns_type = 0;
+ ua_fh->ns_inum = 0;
+
+ ub_handle = (struct file_handle *)ub_buf;
+ ub_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ub_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle;
+ ub_fh->ns_id = ub_id;
+ ub_fh->ns_type = 0;
+ ub_fh->ns_inum = 0;
+
+ /* Open user_B before child exits */
+ int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ if (ub_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open user_B");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* With user_B active, user_A should also be active */
+ TH_LOG("Testing user_A active when child user_B is active");
+ int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd, 0);
+
+ /* Close user_B */
+ TH_LOG("Closing user_B");
+ close(ub_fd);
+
+ /* user_A should remain active (we hold direct ref) */
+ int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd2, 0);
+ close(ua_fd2);
+
+ /* Close user_A - should become inactive */
+ TH_LOG("Closing user_A - should become inactive");
+ close(ua_fd);
+
+ ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_LT(ua_fd, 0);
+}
+
+/*
+ * Test different namespace types (net, uts, ipc) all contributing
+ * active references to the same owning user namespace.
+ */
+TEST(ns_mixed_types_same_owner)
+{
+ struct file_handle *user_handle, *net_handle, *uts_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 u_id, n_id, ut_id;
+ char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ];
+ char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+ char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int u_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (u_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) {
+ close(u_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(u_fd);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int n_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) {
+ close(n_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(n_fd);
+
+ if (unshare(CLONE_NEWUTS) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ut_fd = open("/proc/self/ns/uts", O_RDONLY);
+ if (ut_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) {
+ close(ut_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ut_fd);
+
+ /* Send all namespace IDs */
+ write(pipefd[1], &u_id, sizeof(u_id));
+ write(pipefd[1], &n_id, sizeof(n_id));
+ write(pipefd[1], &ut_id, sizeof(ut_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &u_id, sizeof(u_id));
+ if (ret != sizeof(u_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID");
+ }
+
+ ret = read(pipefd[0], &n_id, sizeof(n_id));
+ if (ret != sizeof(n_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ ret = read(pipefd[0], &ut_id, sizeof(ut_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ut_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read UTS namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ user_handle = (struct file_handle *)u_buf;
+ user_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ user_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle;
+ u_fh->ns_id = u_id;
+ u_fh->ns_type = 0;
+ u_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)n_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ n_fh->ns_id = n_id;
+ n_fh->ns_type = 0;
+ n_fh->ns_inum = 0;
+
+ uts_handle = (struct file_handle *)ut_buf;
+ uts_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ uts_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)uts_handle->f_handle;
+ ut_fh->ns_id = ut_id;
+ ut_fh->ns_type = 0;
+ ut_fh->ns_inum = 0;
+
+ /* Open both non-user namespaces */
+ int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY);
+ if (n_fd < 0 || ut_fd < 0) {
+ if (n_fd >= 0) close(n_fd);
+ if (ut_fd >= 0) close(ut_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open namespaces");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* User namespace should be active (2 active children) */
+ TH_LOG("Both net and uts active - user ns should be active");
+ int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close net - user ns should STILL be active (uts still active) */
+ TH_LOG("Closing net - user ns should still be active");
+ close(n_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close uts - user ns should become inactive */
+ TH_LOG("Closing uts - user ns should become inactive");
+ close(ut_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_LT(u_fd, 0);
+}
+
+/* Thread test helpers and structures */
+struct thread_ns_info {
+ __u64 ns_id;
+ int pipefd;
+ int syncfd_read;
+ int syncfd_write;
+ int exit_code;
+};
+
+static void *thread_create_namespace(void *arg)
+{
+ struct thread_ns_info *info = (struct thread_ns_info *)arg;
+ int ret;
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ info->exit_code = 1;
+ return NULL;
+ }
+
+ /* Get namespace ID */
+ int fd = open("/proc/thread-self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ info->exit_code = 2;
+ return NULL;
+ }
+
+ ret = ioctl(fd, NS_GET_ID, &info->ns_id);
+ close(fd);
+ if (ret < 0) {
+ info->exit_code = 3;
+ return NULL;
+ }
+
+ /* Send namespace ID to main thread */
+ if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) {
+ info->exit_code = 4;
+ return NULL;
+ }
+
+ /* Wait for signal to exit */
+ char sync_byte;
+ if (read(info->syncfd_read, &sync_byte, 1) != 1) {
+ info->exit_code = 5;
+ return NULL;
+ }
+
+ info->exit_code = 0;
+ return NULL;
+}
+
+/*
+ * Test that namespace becomes inactive after thread exits.
+ * This verifies active reference counting works with threads, not just processes.
+ */
+TEST(thread_ns_inactive_after_exit)
+{
+ pthread_t thread;
+ struct thread_ns_info info;
+ struct file_handle *handle;
+ int pipefd[2];
+ int syncpipe[2];
+ int ret;
+ char sync_byte;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ info.pipefd = pipefd[1];
+ info.syncfd_read = syncpipe[0];
+ info.syncfd_write = -1;
+ info.exit_code = -1;
+
+ /* Create thread that will create a namespace */
+ ret = pthread_create(&thread, NULL, thread_create_namespace, &info);
+ ASSERT_EQ(ret, 0);
+
+ /* Read namespace ID from thread */
+ __u64 ns_id;
+ ret = read(pipefd[0], &ns_id, sizeof(ns_id));
+ if (ret != sizeof(ns_id)) {
+ sync_byte = 'X';
+ write(syncpipe[1], &sync_byte, 1);
+ pthread_join(thread, NULL);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ close(syncpipe[1]);
+ SKIP(return, "Failed to read namespace ID from thread");
+ }
+
+ TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id);
+
+ /* Construct file handle */
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle;
+ fh->ns_id = ns_id;
+ fh->ns_type = 0;
+ fh->ns_inum = 0;
+
+ /* Namespace should be active while thread is alive */
+ TH_LOG("Attempting to open namespace while thread is alive (should succeed)");
+ int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(nsfd, 0);
+ close(nsfd);
+
+ /* Signal thread to exit */
+ TH_LOG("Signaling thread to exit");
+ sync_byte = 'X';
+ ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1);
+ close(syncpipe[1]);
+
+ /* Wait for thread to exit */
+ ASSERT_EQ(pthread_join(thread, NULL), 0);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+
+ if (info.exit_code != 0)
+ SKIP(return, "Thread failed to create namespace");
+
+ TH_LOG("Thread exited, namespace should be inactive");
+
+ /* Namespace should now be inactive */
+ nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(nsfd, 0);
+ /* Should fail with ENOENT (inactive) or ESTALE (gone) */
+ TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that a namespace remains active while a thread holds an fd to it.
+ * Even after the thread exits, the namespace should remain active as long as
+ * another thread holds a file descriptor to it.
+ */
+TEST(thread_ns_fd_keeps_active)
+{
+ pthread_t thread;
+ struct thread_ns_info info;
+ struct file_handle *handle;
+ int pipefd[2];
+ int syncpipe[2];
+ int ret;
+ char sync_byte;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ info.pipefd = pipefd[1];
+ info.syncfd_read = syncpipe[0];
+ info.syncfd_write = -1;
+ info.exit_code = -1;
+
+ /* Create thread that will create a namespace */
+ ret = pthread_create(&thread, NULL, thread_create_namespace, &info);
+ ASSERT_EQ(ret, 0);
+
+ /* Read namespace ID from thread */
+ __u64 ns_id;
+ ret = read(pipefd[0], &ns_id, sizeof(ns_id));
+ if (ret != sizeof(ns_id)) {
+ sync_byte = 'X';
+ write(syncpipe[1], &sync_byte, 1);
+ pthread_join(thread, NULL);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ close(syncpipe[1]);
+ SKIP(return, "Failed to read namespace ID from thread");
+ }
+
+ TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id);
+
+ /* Construct file handle */
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle;
+ fh->ns_id = ns_id;
+ fh->ns_type = 0;
+ fh->ns_inum = 0;
+
+ /* Open namespace while thread is alive */
+ TH_LOG("Opening namespace while thread is alive");
+ int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(nsfd, 0);
+
+ /* Signal thread to exit */
+ TH_LOG("Signaling thread to exit");
+ sync_byte = 'X';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ /* Wait for thread to exit */
+ pthread_join(thread, NULL);
+ close(pipefd[0]);
+ close(pipefd[1]);
+ close(syncpipe[0]);
+
+ if (info.exit_code != 0) {
+ close(nsfd);
+ SKIP(return, "Thread failed to create namespace");
+ }
+
+ TH_LOG("Thread exited, but main thread holds fd - namespace should remain active");
+
+ /* Namespace should still be active because we hold an fd */
+ int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(nsfd2, 0);
+
+ /* Verify it's the same namespace */
+ struct stat st1, st2;
+ ASSERT_EQ(fstat(nsfd, &st1), 0);
+ ASSERT_EQ(fstat(nsfd2, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ close(nsfd2);
+
+ TH_LOG("Closing fd - namespace should become inactive");
+ close(nsfd);
+
+ /* Now namespace should be inactive */
+ nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(nsfd, 0);
+ /* Should fail with ENOENT (inactive) or ESTALE (gone) */
+ TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/* Structure for thread data in subprocess */
+struct thread_sleep_data {
+ int syncfd_read;
+};
+
+static void *thread_sleep_and_wait(void *arg)
+{
+ struct thread_sleep_data *data = (struct thread_sleep_data *)arg;
+ char sync_byte;
+
+ /* Wait for signal to exit - read will unblock when pipe is closed */
+ (void)read(data->syncfd_read, &sync_byte, 1);
+ return NULL;
+}
+
+/*
+ * Test that namespaces become inactive after subprocess with multiple threads exits.
+ * Create a subprocess that unshares user and network namespaces, then creates two
+ * threads that share those namespaces. Verify that after all threads and subprocess
+ * exit, the namespaces are no longer listed by listns() and cannot be opened by
+ * open_by_handle_at().
+ */
+TEST(thread_subprocess_ns_inactive_after_all_exit)
+{
+ int pipefd[2];
+ int sv[2];
+ pid_t pid;
+ int status;
+ __u64 user_id, net_id;
+ struct file_handle *user_handle, *net_handle;
+ char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ];
+ char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+ int ret;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+ close(sv[0]);
+
+ /* Create user namespace with mappings */
+ if (setup_userns() < 0) {
+ fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ fprintf(stderr, "Child: setup_userns() succeeded\n");
+
+ /* Get user namespace ID */
+ int user_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (user_fd < 0) {
+ fprintf(stderr, "Child: open(/proc/self/ns/user) failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(user_fd, NS_GET_ID, &user_id) < 0) {
+ fprintf(stderr, "Child: ioctl(NS_GET_ID) for user ns failed: %s\n", strerror(errno));
+ close(user_fd);
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ close(user_fd);
+ fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id);
+
+ /* Unshare network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n");
+
+ /* Get network namespace ID */
+ int net_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (net_fd < 0) {
+ fprintf(stderr, "Child: open(/proc/self/ns/net) failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) {
+ fprintf(stderr, "Child: ioctl(NS_GET_ID) for net ns failed: %s\n", strerror(errno));
+ close(net_fd);
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ close(net_fd);
+ fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id);
+
+ /* Send namespace IDs to parent */
+ if (write(pipefd[1], &user_id, sizeof(user_id)) != sizeof(user_id)) {
+ fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno));
+ exit(1);
+ }
+ if (write(pipefd[1], &net_id, sizeof(net_id)) != sizeof(net_id)) {
+ fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno));
+ exit(1);
+ }
+ close(pipefd[1]);
+ fprintf(stderr, "Child: sent namespace IDs to parent\n");
+
+ /* Create two threads that share the namespaces */
+ pthread_t thread1, thread2;
+ struct thread_sleep_data data;
+ data.syncfd_read = sv[1];
+
+ int ret_thread = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data);
+ if (ret_thread != 0) {
+ fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(ret_thread));
+ close(sv[1]);
+ exit(1);
+ }
+ fprintf(stderr, "Child: created thread1\n");
+
+ ret_thread = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data);
+ if (ret_thread != 0) {
+ fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(ret_thread));
+ close(sv[1]);
+ pthread_cancel(thread1);
+ exit(1);
+ }
+ fprintf(stderr, "Child: created thread2\n");
+
+ /* Wait for threads to complete - they will unblock when parent writes */
+ fprintf(stderr, "Child: waiting for threads to exit\n");
+ pthread_join(thread1, NULL);
+ fprintf(stderr, "Child: thread1 exited\n");
+ pthread_join(thread2, NULL);
+ fprintf(stderr, "Child: thread2 exited\n");
+
+ close(sv[1]);
+
+ /* Exit - namespaces should become inactive */
+ fprintf(stderr, "Child: all threads joined, exiting with success\n");
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ close(sv[1]);
+
+ TH_LOG("Parent: waiting to read namespace IDs from child");
+
+ /* Read namespace IDs from child */
+ ret = read(pipefd[0], &user_id, sizeof(user_id));
+ if (ret != sizeof(user_id)) {
+ TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno));
+ close(pipefd[0]);
+ sync_byte = 'X';
+ (void)write(sv[0], &sync_byte, 1);
+ close(sv[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID from child");
+ }
+
+ ret = read(pipefd[0], &net_id, sizeof(net_id));
+ close(pipefd[0]);
+ if (ret != sizeof(net_id)) {
+ TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno));
+ sync_byte = 'X';
+ (void)write(sv[0], &sync_byte, 1);
+ close(sv[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID from child");
+ }
+
+ TH_LOG("Child created user ns %llu and net ns %llu with 2 threads",
+ (unsigned long long)user_id, (unsigned long long)net_id);
+
+ /* Construct file handles */
+ user_handle = (struct file_handle *)user_buf;
+ user_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ user_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *user_fh = (struct nsfs_file_handle *)user_handle->f_handle;
+ user_fh->ns_id = user_id;
+ user_fh->ns_type = 0;
+ user_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)net_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ net_fh->ns_id = net_id;
+ net_fh->ns_type = 0;
+ net_fh->ns_inum = 0;
+
+ /* Verify namespaces are active while subprocess and threads are alive */
+ TH_LOG("Verifying namespaces are active while subprocess with threads is running");
+ int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(user_fd, 0);
+
+ int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ ASSERT_GE(net_fd, 0);
+
+ close(user_fd);
+ close(net_fd);
+
+ /* Also verify they appear in listns() */
+ TH_LOG("Verifying namespaces appear in listns() while active");
+ struct ns_id_req req = {
+ .size = sizeof(struct ns_id_req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ int nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids < 0) {
+ TH_LOG("listns() not available, skipping listns verification");
+ } else {
+ /* Check if user_id is in the list */
+ int found_user = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == user_id) {
+ found_user = 1;
+ break;
+ }
+ }
+ ASSERT_TRUE(found_user);
+ TH_LOG("User namespace found in listns() as expected");
+
+ /* Check network namespace */
+ req.ns_type = CLONE_NEWNET;
+ nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids >= 0) {
+ int found_net = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == net_id) {
+ found_net = 1;
+ break;
+ }
+ }
+ ASSERT_TRUE(found_net);
+ TH_LOG("Network namespace found in listns() as expected");
+ }
+ }
+
+ /* Signal threads to exit */
+ TH_LOG("Signaling threads to exit");
+ sync_byte = 'X';
+ /* Write two bytes - one for each thread */
+ ASSERT_EQ(write(sv[0], &sync_byte, 1), 1);
+ ASSERT_EQ(write(sv[0], &sync_byte, 1), 1);
+ close(sv[0]);
+
+ /* Wait for child process to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ if (WEXITSTATUS(status) != 0) {
+ TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status));
+ SKIP(return, "Child process failed");
+ }
+
+ TH_LOG("Subprocess and all threads have exited successfully");
+
+ /* Verify namespaces are now inactive - open_by_handle_at should fail */
+ TH_LOG("Verifying namespaces are inactive after subprocess and threads exit");
+ user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_LT(user_fd, 0);
+ TH_LOG("User namespace inactive as expected: %s (errno=%d)",
+ strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+
+ net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ ASSERT_LT(net_fd, 0);
+ TH_LOG("Network namespace inactive as expected: %s (errno=%d)",
+ strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+
+ /* Verify namespaces do NOT appear in listns() */
+ TH_LOG("Verifying namespaces do NOT appear in listns() when inactive");
+ memset(&req, 0, sizeof(req));
+ req.size = sizeof(struct ns_id_req);
+ req.ns_type = CLONE_NEWUSER;
+ nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids >= 0) {
+ int found_user = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == user_id) {
+ found_user = 1;
+ break;
+ }
+ }
+ ASSERT_FALSE(found_user);
+ TH_LOG("User namespace correctly not listed in listns()");
+
+ /* Check network namespace */
+ req.ns_type = CLONE_NEWNET;
+ nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids >= 0) {
+ int found_net = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == net_id) {
+ found_net = 1;
+ break;
+ }
+ }
+ ASSERT_FALSE(found_net);
+ TH_LOG("Network namespace correctly not listed in listns()");
+ }
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c
index e28accd74a57..527ade0a8673 100644
--- a/tools/testing/selftests/namespaces/nsid_test.c
+++ b/tools/testing/selftests/namespaces/nsid_test.c
@@ -6,6 +6,7 @@
#include <libgen.h>
#include <limits.h>
#include <pthread.h>
+#include <signal.h>
#include <string.h>
#include <sys/mount.h>
#include <poll.h>
@@ -14,12 +15,30 @@
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
+#include <sys/wait.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/limits.h>
#include <linux/nsfs.h>
#include "../kselftest_harness.h"
+/* Fixture for tests that create child processes */
+FIXTURE(nsid) {
+ pid_t child_pid;
+};
+
+FIXTURE_SETUP(nsid) {
+ self->child_pid = 0;
+}
+
+FIXTURE_TEARDOWN(nsid) {
+ /* Clean up any child process that may still be running */
+ if (self->child_pid > 0) {
+ kill(self->child_pid, SIGKILL);
+ waitpid(self->child_pid, NULL, 0);
+ }
+}
+
TEST(nsid_mntns_basic)
{
__u64 mnt_ns_id = 0;
@@ -44,7 +63,7 @@ TEST(nsid_mntns_basic)
close(fd_mntns);
}
-TEST(nsid_mntns_separate)
+TEST_F(nsid, mntns_separate)
{
__u64 parent_mnt_ns_id = 0;
__u64 child_mnt_ns_id = 0;
@@ -90,6 +109,9 @@ TEST(nsid_mntns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -99,8 +121,6 @@ TEST(nsid_mntns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_mntns);
SKIP(return, "No permission to create mount namespace");
}
@@ -123,10 +143,6 @@ TEST(nsid_mntns_separate)
close(fd_parent_mntns);
close(fd_child_mntns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_cgroupns_basic)
@@ -153,7 +169,7 @@ TEST(nsid_cgroupns_basic)
close(fd_cgroupns);
}
-TEST(nsid_cgroupns_separate)
+TEST_F(nsid, cgroupns_separate)
{
__u64 parent_cgroup_ns_id = 0;
__u64 child_cgroup_ns_id = 0;
@@ -199,6 +215,9 @@ TEST(nsid_cgroupns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -208,8 +227,6 @@ TEST(nsid_cgroupns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_cgroupns);
SKIP(return, "No permission to create cgroup namespace");
}
@@ -232,10 +249,6 @@ TEST(nsid_cgroupns_separate)
close(fd_parent_cgroupns);
close(fd_child_cgroupns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_ipcns_basic)
@@ -262,7 +275,7 @@ TEST(nsid_ipcns_basic)
close(fd_ipcns);
}
-TEST(nsid_ipcns_separate)
+TEST_F(nsid, ipcns_separate)
{
__u64 parent_ipc_ns_id = 0;
__u64 child_ipc_ns_id = 0;
@@ -308,6 +321,9 @@ TEST(nsid_ipcns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -317,8 +333,6 @@ TEST(nsid_ipcns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_ipcns);
SKIP(return, "No permission to create IPC namespace");
}
@@ -341,10 +355,6 @@ TEST(nsid_ipcns_separate)
close(fd_parent_ipcns);
close(fd_child_ipcns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_utsns_basic)
@@ -371,7 +381,7 @@ TEST(nsid_utsns_basic)
close(fd_utsns);
}
-TEST(nsid_utsns_separate)
+TEST_F(nsid, utsns_separate)
{
__u64 parent_uts_ns_id = 0;
__u64 child_uts_ns_id = 0;
@@ -417,6 +427,9 @@ TEST(nsid_utsns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -426,8 +439,6 @@ TEST(nsid_utsns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_utsns);
SKIP(return, "No permission to create UTS namespace");
}
@@ -450,10 +461,6 @@ TEST(nsid_utsns_separate)
close(fd_parent_utsns);
close(fd_child_utsns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_userns_basic)
@@ -480,7 +487,7 @@ TEST(nsid_userns_basic)
close(fd_userns);
}
-TEST(nsid_userns_separate)
+TEST_F(nsid, userns_separate)
{
__u64 parent_user_ns_id = 0;
__u64 child_user_ns_id = 0;
@@ -526,6 +533,9 @@ TEST(nsid_userns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -535,8 +545,6 @@ TEST(nsid_userns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_userns);
SKIP(return, "No permission to create user namespace");
}
@@ -559,10 +567,6 @@ TEST(nsid_userns_separate)
close(fd_parent_userns);
close(fd_child_userns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_timens_basic)
@@ -591,7 +595,7 @@ TEST(nsid_timens_basic)
close(fd_timens);
}
-TEST(nsid_timens_separate)
+TEST_F(nsid, timens_separate)
{
__u64 parent_time_ns_id = 0;
__u64 child_time_ns_id = 0;
@@ -652,6 +656,9 @@ TEST(nsid_timens_separate)
}
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -660,8 +667,6 @@ TEST(nsid_timens_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_timens);
close(pipefd[0]);
SKIP(return, "Cannot create time namespace");
@@ -689,10 +694,6 @@ TEST(nsid_timens_separate)
close(fd_parent_timens);
close(fd_child_timens);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_pidns_basic)
@@ -719,7 +720,7 @@ TEST(nsid_pidns_basic)
close(fd_pidns);
}
-TEST(nsid_pidns_separate)
+TEST_F(nsid, pidns_separate)
{
__u64 parent_pid_ns_id = 0;
__u64 child_pid_ns_id = 0;
@@ -776,6 +777,9 @@ TEST(nsid_pidns_separate)
}
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -784,8 +788,6 @@ TEST(nsid_pidns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_pidns);
close(pipefd[0]);
SKIP(return, "No permission to create PID namespace");
@@ -813,10 +815,6 @@ TEST(nsid_pidns_separate)
close(fd_parent_pidns);
close(fd_child_pidns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_netns_basic)
@@ -860,7 +858,7 @@ TEST(nsid_netns_basic)
close(fd_netns);
}
-TEST(nsid_netns_separate)
+TEST_F(nsid, netns_separate)
{
__u64 parent_net_ns_id = 0;
__u64 parent_netns_cookie = 0;
@@ -920,6 +918,9 @@ TEST(nsid_netns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -929,8 +930,6 @@ TEST(nsid_netns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_netns);
close(parent_sock);
SKIP(return, "No permission to create network namespace");
@@ -977,10 +976,6 @@ TEST(nsid_netns_separate)
close(fd_parent_netns);
close(fd_child_netns);
close(parent_sock);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
new file mode 100644
index 000000000000..753fd29dffd8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "../pidfd/pidfd.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Regression tests for the setns(pidfd) active reference counting bug.
+ *
+ * These tests are based on the reproducers that triggered the race condition
+ * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly").
+ *
+ * The bug: When using setns() with a pidfd, if the target task exits between
+ * prepare_nsset() and commit_nsset(), the namespaces would become inactive.
+ * Then ns_ref_active_get() would increment from 0 without properly resurrecting
+ * the owner chain, causing active reference count underflows.
+ */
+
+/*
+ * Simple pidfd setns test using create_child()+unshare().
+ *
+ * Without the fix, this would trigger active refcount warnings when the
+ * parent exits after doing setns(pidfd) on a child that has already exited.
+ */
+TEST(simple_pidfd_setns)
+{
+ pid_t child_pid;
+ int pidfd = -1;
+ int ret;
+ int sv[2];
+ char c;
+
+ /* Ignore SIGCHLD for autoreap */
+ ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create a child process without namespaces initially */
+ child_pid = create_child(&pidfd, 0);
+ ASSERT_GE(child_pid, 0);
+
+ if (child_pid == 0) {
+ close(sv[0]);
+
+ if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) {
+ close(sv[1]);
+ _exit(1);
+ }
+
+ /* Signal parent that namespaces are ready */
+ if (write_nointr(sv[1], "1", 1) < 0) {
+ close(sv[1]);
+ _exit(1);
+ }
+
+ close(sv[1]);
+ _exit(0);
+ }
+ ASSERT_GE(pidfd, 0);
+ EXPECT_EQ(close(sv[1]), 0);
+
+ ret = read_nointr(sv[0], &c, 1);
+ ASSERT_EQ(ret, 1);
+ EXPECT_EQ(close(sv[0]), 0);
+
+ /* Set to child's namespaces via pidfd */
+ ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+ TH_LOG("setns() returned %d", ret);
+ close(pidfd);
+}
+
+/*
+ * Simple pidfd setns test using create_child().
+ *
+ * This variation uses create_child() with namespace flags directly.
+ * Namespaces are created immediately at clone time.
+ */
+TEST(simple_pidfd_setns_clone)
+{
+ pid_t child_pid;
+ int pidfd = -1;
+ int ret;
+
+ /* Ignore SIGCHLD for autoreap */
+ ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+ /* Create a child process with new namespaces using create_child() */
+ child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET);
+ ASSERT_GE(child_pid, 0);
+
+ if (child_pid == 0) {
+ /* Child: sleep for a while so parent can setns to us */
+ sleep(2);
+ _exit(0);
+ }
+
+ /* Parent: pidfd was already created by create_child() */
+ ASSERT_GE(pidfd, 0);
+
+ /* Set to child's namespaces via pidfd */
+ ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+ close(pidfd);
+ TH_LOG("setns() returned %d", ret);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/siocgskns_test.c b/tools/testing/selftests/namespaces/siocgskns_test.c
new file mode 100644
index 000000000000..ba689a22d82f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/siocgskns_test.c
@@ -0,0 +1,1824 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/if.h>
+#include <linux/sockios.h>
+#include <linux/nsfs.h>
+#include <arpa/inet.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+#ifndef SIOCGSKNS
+#define SIOCGSKNS 0x894C
+#endif
+
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003
+#endif
+
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
+
+/*
+ * Test basic SIOCGSKNS functionality.
+ * Create a socket and verify SIOCGSKNS returns the correct network namespace.
+ */
+TEST(siocgskns_basic)
+{
+ int sock_fd, netns_fd, current_netns_fd;
+ struct stat st1, st2;
+
+ /* Create a TCP socket */
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(sock_fd, 0);
+
+ /* Use SIOCGSKNS to get network namespace */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Get current network namespace */
+ current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(current_netns_fd, 0);
+
+ /* Verify they match */
+ ASSERT_EQ(fstat(netns_fd, &st1), 0);
+ ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ close(sock_fd);
+ close(netns_fd);
+ close(current_netns_fd);
+}
+
+/*
+ * Test that socket file descriptors keep network namespaces active.
+ * Create a network namespace, create a socket in it, then exit the namespace.
+ * The namespace should remain active while the socket FD is held.
+ */
+TEST(siocgskns_keeps_netns_active)
+{
+ int sock_fd, netns_fd, test_fd;
+ int ipc_sockets[2];
+ pid_t pid;
+ int status;
+ struct stat st;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new netns and socket */
+ close(ipc_sockets[0]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Create a socket in the new network namespace */
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ TH_LOG("socket() failed: %s", strerror(errno));
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ ASSERT_EQ(fstat(netns_fd, &st), 0);
+
+ /*
+ * Namespace should still be active because socket FD keeps it alive.
+ * Try to access it via /proc/self/fd/<fd>.
+ */
+ char path[64];
+ snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd);
+ test_fd = open(path, O_RDONLY);
+ ASSERT_GE(test_fd, 0);
+ close(test_fd);
+ close(netns_fd);
+
+ /* Close socket - namespace should become inactive */
+ close(sock_fd);
+
+ /* Try SIOCGSKNS again - should fail since socket is closed */
+ ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0);
+}
+
+/*
+ * Test SIOCGSKNS with different socket types (TCP, UDP, RAW).
+ */
+TEST(siocgskns_socket_types)
+{
+ int sock_tcp, sock_udp, sock_raw;
+ int netns_tcp, netns_udp, netns_raw;
+ struct stat st_tcp, st_udp, st_raw;
+
+ /* TCP socket */
+ sock_tcp = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(sock_tcp, 0);
+
+ /* UDP socket */
+ sock_udp = socket(AF_INET, SOCK_DGRAM, 0);
+ ASSERT_GE(sock_udp, 0);
+
+ /* RAW socket (may require privileges) */
+ sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
+ if (sock_raw < 0 && (errno == EPERM || errno == EACCES)) {
+ sock_raw = -1; /* Skip raw socket test */
+ }
+
+ /* Test SIOCGSKNS on TCP */
+ netns_tcp = ioctl(sock_tcp, SIOCGSKNS);
+ if (netns_tcp < 0) {
+ close(sock_tcp);
+ close(sock_udp);
+ if (sock_raw >= 0) close(sock_raw);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_tcp, 0);
+ }
+
+ /* Test SIOCGSKNS on UDP */
+ netns_udp = ioctl(sock_udp, SIOCGSKNS);
+ ASSERT_GE(netns_udp, 0);
+
+ /* Test SIOCGSKNS on RAW (if available) */
+ if (sock_raw >= 0) {
+ netns_raw = ioctl(sock_raw, SIOCGSKNS);
+ ASSERT_GE(netns_raw, 0);
+ }
+
+ /* Verify all return the same network namespace */
+ ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0);
+ ASSERT_EQ(fstat(netns_udp, &st_udp), 0);
+ ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino);
+
+ if (sock_raw >= 0) {
+ ASSERT_EQ(fstat(netns_raw, &st_raw), 0);
+ ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino);
+ close(netns_raw);
+ close(sock_raw);
+ }
+
+ close(netns_tcp);
+ close(netns_udp);
+ close(sock_tcp);
+ close(sock_udp);
+}
+
+/*
+ * Test SIOCGSKNS across setns.
+ * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still
+ * returns netns A.
+ */
+TEST(siocgskns_across_setns)
+{
+ int sock_fd, netns_a_fd, netns_b_fd, result_fd;
+ struct stat st_a;
+
+ /* Get current netns (A) */
+ netns_a_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(netns_a_fd, 0);
+ ASSERT_EQ(fstat(netns_a_fd, &st_a), 0);
+
+ /* Create socket in netns A */
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(sock_fd, 0);
+
+ /* Create new netns (B) */
+ ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+ netns_b_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(netns_b_fd, 0);
+
+ /* Get netns from socket created in A */
+ result_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (result_fd < 0) {
+ close(sock_fd);
+ setns(netns_a_fd, CLONE_NEWNET);
+ close(netns_a_fd);
+ close(netns_b_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(result_fd, 0);
+ }
+
+ /* Verify it still points to netns A */
+ struct stat st_result_stat;
+ ASSERT_EQ(fstat(result_fd, &st_result_stat), 0);
+ ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino);
+
+ close(result_fd);
+ close(sock_fd);
+ close(netns_b_fd);
+
+ /* Restore original netns */
+ ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0);
+ close(netns_a_fd);
+}
+
+/*
+ * Test SIOCGSKNS fails on non-socket file descriptors.
+ */
+TEST(siocgskns_non_socket)
+{
+ int fd;
+ int pipefd[2];
+
+ /* Test on regular file */
+ fd = open("/dev/null", O_RDONLY);
+ ASSERT_GE(fd, 0);
+
+ ASSERT_LT(ioctl(fd, SIOCGSKNS), 0);
+ ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+ close(fd);
+
+ /* Test on pipe */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ ASSERT_LT(ioctl(pipefd[0], SIOCGSKNS), 0);
+ ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+
+ close(pipefd[0]);
+ close(pipefd[1]);
+}
+
+/*
+ * Test multiple sockets keep the same network namespace active.
+ * Create multiple sockets, verify closing some doesn't affect others.
+ */
+TEST(siocgskns_multiple_sockets)
+{
+ int socks[5];
+ int netns_fds[5];
+ int i;
+ struct stat st;
+ ino_t netns_ino;
+
+ /* Create new network namespace */
+ ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+ /* Create multiple sockets */
+ for (i = 0; i < 5; i++) {
+ socks[i] = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(socks[i], 0);
+ }
+
+ /* Get netns from all sockets */
+ for (i = 0; i < 5; i++) {
+ netns_fds[i] = ioctl(socks[i], SIOCGSKNS);
+ if (netns_fds[i] < 0) {
+ int j;
+ for (j = 0; j <= i; j++) {
+ close(socks[j]);
+ if (j < i && netns_fds[j] >= 0)
+ close(netns_fds[j]);
+ }
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fds[i], 0);
+ }
+ }
+
+ /* Verify all point to same netns */
+ ASSERT_EQ(fstat(netns_fds[0], &st), 0);
+ netns_ino = st.st_ino;
+
+ for (i = 1; i < 5; i++) {
+ ASSERT_EQ(fstat(netns_fds[i], &st), 0);
+ ASSERT_EQ(st.st_ino, netns_ino);
+ }
+
+ /* Close some sockets */
+ for (i = 0; i < 3; i++) {
+ close(socks[i]);
+ }
+
+ /* Remaining netns FDs should still be valid */
+ for (i = 3; i < 5; i++) {
+ char path[64];
+ snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]);
+ int test_fd = open(path, O_RDONLY);
+ ASSERT_GE(test_fd, 0);
+ close(test_fd);
+ }
+
+ /* Cleanup */
+ for (i = 0; i < 5; i++) {
+ if (i >= 3)
+ close(socks[i]);
+ close(netns_fds[i]);
+ }
+}
+
+/*
+ * Test socket keeps netns active after creating process exits.
+ * Verify that as long as the socket FD exists, the namespace remains active.
+ */
+TEST(siocgskns_netns_lifecycle)
+{
+ int sock_fd, netns_fd;
+ int ipc_sockets[2];
+ int syncpipe[2];
+ pid_t pid;
+ int status;
+ char sync_byte;
+ struct stat st;
+ ino_t netns_ino;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child */
+ close(ipc_sockets[0]);
+ close(syncpipe[1]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ /* Send socket to parent */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+
+ /* Wait for parent signal */
+ read(syncpipe[0], &sync_byte, 1);
+ close(syncpipe[0]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+
+ /* Receive socket FD */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Get netns from socket while child is alive */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ sync_byte = 'G';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+ close(sock_fd);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+ ASSERT_EQ(fstat(netns_fd, &st), 0);
+ netns_ino = st.st_ino;
+
+ /* Signal child to exit */
+ sync_byte = 'G';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+
+ /*
+ * Socket FD should still keep namespace active even after
+ * the creating process exited.
+ */
+ int test_fd = ioctl(sock_fd, SIOCGSKNS);
+ ASSERT_GE(test_fd, 0);
+
+ struct stat st_test;
+ ASSERT_EQ(fstat(test_fd, &st_test), 0);
+ ASSERT_EQ(st_test.st_ino, netns_ino);
+
+ close(test_fd);
+ close(netns_fd);
+
+ /* Close socket - namespace should become inactive */
+ close(sock_fd);
+}
+
+/*
+ * Test IPv6 sockets also work with SIOCGSKNS.
+ */
+TEST(siocgskns_ipv6)
+{
+ int sock_fd, netns_fd, current_netns_fd;
+ struct stat st1, st2;
+
+ /* Create an IPv6 TCP socket */
+ sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
+ ASSERT_GE(sock_fd, 0);
+
+ /* Use SIOCGSKNS */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Verify it matches current namespace */
+ current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(current_netns_fd, 0);
+
+ ASSERT_EQ(fstat(netns_fd, &st1), 0);
+ ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ close(sock_fd);
+ close(netns_fd);
+ close(current_netns_fd);
+}
+
+/*
+ * Test that socket-kept netns appears in listns() output.
+ * Verify that a network namespace kept alive by a socket FD appears in
+ * listns() output even after the creating process exits, and that it
+ * disappears when the socket is closed.
+ */
+TEST(siocgskns_listns_visibility)
+{
+ int sock_fd, netns_fd, owner_fd;
+ int ipc_sockets[2];
+ pid_t pid;
+ int status;
+ __u64 netns_id, owner_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ int ret, i;
+ bool found_netns = false;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new netns and socket */
+ close(ipc_sockets[0]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Get namespace ID */
+ ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+ if (ret < 0) {
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Get owner user namespace */
+ owner_fd = ioctl(netns_fd, NS_GET_USERNS);
+ if (owner_fd < 0) {
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_USERNS not supported");
+ ASSERT_GE(owner_fd, 0);
+ }
+
+ /* Get owner namespace ID */
+ ret = ioctl(owner_fd, NS_GET_ID, &owner_id);
+ if (ret < 0) {
+ close(owner_fd);
+ close(sock_fd);
+ close(netns_fd);
+ ASSERT_EQ(ret, 0);
+ }
+ close(owner_fd);
+
+ /* Namespace should appear in listns() output */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s", strerror(errno));
+ ASSERT_GE(ret, 0);
+ }
+
+ /* Search for our network namespace in the list */
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id) {
+ found_netns = true;
+ break;
+ }
+ }
+
+ ASSERT_TRUE(found_netns);
+ TH_LOG("Found netns %llu in listns() output (kept alive by socket)", netns_id);
+
+ /* Now verify with owner filtering */
+ req.user_ns_id = owner_id;
+ found_netns = false;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id) {
+ found_netns = true;
+ break;
+ }
+ }
+
+ ASSERT_TRUE(found_netns);
+ TH_LOG("Found netns %llu owned by userns %llu", netns_id, owner_id);
+
+ /* Close socket - namespace should become inactive and disappear from listns() */
+ close(sock_fd);
+ close(netns_fd);
+
+ /* Verify it's no longer in listns() output */
+ req.user_ns_id = 0;
+ found_netns = false;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id) {
+ found_netns = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found_netns);
+ TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id);
+}
+
+/*
+ * Test that socket-kept netns can be reopened via file handle.
+ * Verify that a network namespace kept alive by a socket FD can be
+ * reopened using file handles even after the creating process exits.
+ */
+TEST(siocgskns_file_handle)
+{
+ int sock_fd, netns_fd, reopened_fd;
+ int ipc_sockets[2];
+ pid_t pid;
+ int status;
+ struct stat st1, st2;
+ ino_t netns_ino;
+ __u64 netns_id;
+ struct file_handle *handle;
+ struct nsfs_file_handle *nsfs_fh;
+ int ret;
+
+ /* Allocate file_handle structure for nsfs */
+ handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+ ASSERT_NE(handle, NULL);
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new netns and socket */
+ close(ipc_sockets[0]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ ASSERT_EQ(fstat(netns_fd, &st1), 0);
+ netns_ino = st1.st_ino;
+
+ /* Get namespace ID */
+ ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Construct file handle from namespace ID */
+ nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+ nsfs_fh->ns_id = netns_id;
+ nsfs_fh->ns_type = 0; /* Type field not needed for reopening */
+ nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */
+
+ TH_LOG("Constructed file handle for netns %lu (id=%llu)", netns_ino, netns_id);
+
+ /* Reopen namespace using file handle (while socket still keeps it alive) */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ /* Verify it's the same namespace */
+ ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ TH_LOG("Successfully reopened netns %lu via file handle", netns_ino);
+
+ close(reopened_fd);
+
+ /* Close the netns FD */
+ close(netns_fd);
+
+ /* Try to reopen via file handle - should fail since namespace is now inactive */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(reopened_fd, 0);
+ TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno));
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Reopen namespace using file handle (while socket still keeps it alive) */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ /* Verify it's the same namespace */
+ ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ TH_LOG("Successfully reopened netns %lu via file handle", netns_ino);
+
+ /* Close socket - namespace should become inactive */
+ close(sock_fd);
+ free(handle);
+}
+
+/*
+ * Test combined listns() and file handle operations with socket-kept netns.
+ * Create a netns, keep it alive with a socket, verify it appears in listns(),
+ * then reopen it via file handle obtained from listns() entry.
+ */
+TEST(siocgskns_listns_and_file_handle)
+{
+ int sock_fd, netns_fd, userns_fd, reopened_fd;
+ int ipc_sockets[2];
+ pid_t pid;
+ int status;
+ struct stat st;
+ ino_t netns_ino;
+ __u64 netns_id, userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET | CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ int ret, i;
+ bool found_netns = false, found_userns = false;
+ struct file_handle *handle;
+ struct nsfs_file_handle *nsfs_fh;
+
+ /* Allocate file_handle structure for nsfs */
+ handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+ ASSERT_NE(handle, NULL);
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new userns and netns with socket */
+ close(ipc_sockets[0]);
+
+ if (setup_userns() < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ ASSERT_EQ(fstat(netns_fd, &st), 0);
+ netns_ino = st.st_ino;
+
+ /* Get namespace ID */
+ ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Get owner user namespace */
+ userns_fd = ioctl(netns_fd, NS_GET_USERNS);
+ if (userns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_USERNS not supported");
+ ASSERT_GE(userns_fd, 0);
+ }
+
+ /* Get owner namespace ID */
+ ret = ioctl(userns_fd, NS_GET_ID, &userns_id);
+ if (ret < 0) {
+ close(userns_fd);
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ ASSERT_EQ(ret, 0);
+ }
+ close(userns_fd);
+
+ TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu", netns_ino, netns_id, userns_id);
+
+ /* Verify namespace appears in listns() */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s", strerror(errno));
+ ASSERT_GE(ret, 0);
+ }
+
+ found_netns = false;
+ found_userns = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id)
+ found_netns = true;
+ if (ns_ids[i] == userns_id)
+ found_userns = true;
+ }
+ ASSERT_TRUE(found_netns);
+ ASSERT_TRUE(found_userns);
+ TH_LOG("Found netns %llu in listns() output", netns_id);
+
+ /* Construct file handle from namespace ID */
+ nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+ nsfs_fh->ns_id = netns_id;
+ nsfs_fh->ns_type = 0;
+ nsfs_fh->ns_inum = 0;
+
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ struct stat reopened_st;
+ ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0);
+ ASSERT_EQ(reopened_st.st_ino, netns_ino);
+
+ TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)", netns_ino);
+
+ close(reopened_fd);
+ close(netns_fd);
+
+ /* Try to reopen via file handle - should fail since namespace is now inactive */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(reopened_fd, 0);
+ TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno));
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Verify namespace appears in listns() */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s", strerror(errno));
+ ASSERT_GE(ret, 0);
+ }
+
+ found_netns = false;
+ found_userns = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id)
+ found_netns = true;
+ if (ns_ids[i] == userns_id)
+ found_userns = true;
+ }
+ ASSERT_TRUE(found_netns);
+ ASSERT_TRUE(found_userns);
+ TH_LOG("Found netns %llu in listns() output", netns_id);
+
+ close(netns_fd);
+
+ /* Verify namespace appears in listns() */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s", strerror(errno));
+ ASSERT_GE(ret, 0);
+ }
+
+ found_netns = false;
+ found_userns = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id)
+ found_netns = true;
+ if (ns_ids[i] == userns_id)
+ found_userns = true;
+ }
+ ASSERT_FALSE(found_netns);
+ ASSERT_FALSE(found_userns);
+ TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id);
+
+ close(sock_fd);
+ free(handle);
+}
+
+/*
+ * Test multi-level namespace resurrection across three user namespace levels.
+ *
+ * This test creates a complex namespace hierarchy with three levels of user
+ * namespaces and a network namespace at the deepest level. It verifies that
+ * the resurrection semantics work correctly when SIOCGSKNS is called on a
+ * socket from an inactive namespace tree, and that listns() and
+ * open_by_handle_at() correctly respect visibility rules.
+ *
+ * Hierarchy after child processes exit (all with 0 active refcount):
+ *
+ * net_L3A (0) <- Level 3 network namespace
+ * |
+ * +
+ * userns_L3 (0) <- Level 3 user namespace
+ * |
+ * +
+ * userns_L2 (0) <- Level 2 user namespace
+ * |
+ * +
+ * userns_L1 (0) <- Level 1 user namespace
+ * |
+ * x
+ * init_user_ns
+ *
+ * The test verifies:
+ * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain
+ * 2. After resurrection, all namespaces are visible in listns()
+ * 3. Resurrected namespaces can be reopened via file handles
+ * 4. Closing the netns FD cascades down: the entire ownership chain
+ * (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again
+ * 5. Inactive namespaces disappear from listns() and cannot be reopened
+ * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again
+ * 7. After second resurrection, namespaces are visible and can be reopened
+ */
+TEST(siocgskns_multilevel_resurrection)
+{
+ int ipc_sockets[2];
+ pid_t pid_l1, pid_l2, pid_l3;
+ int status;
+
+ /* Namespace file descriptors to be received from child */
+ int sock_L3A_fd = -1;
+ int netns_L3A_fd = -1;
+ __u64 netns_L3A_id;
+ __u64 userns_L1_id, userns_L2_id, userns_L3_id;
+
+ /* For listns() and file handle testing */
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET | CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ int ret, i;
+ struct file_handle *handle;
+ struct nsfs_file_handle *nsfs_fh;
+ int reopened_fd;
+
+ /* Allocate file handle for testing */
+ handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+ ASSERT_NE(handle, NULL);
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ /*
+ * Fork level 1 child that creates userns_L1
+ */
+ pid_l1 = fork();
+ ASSERT_GE(pid_l1, 0);
+
+ if (pid_l1 == 0) {
+ /* Level 1 child */
+ int ipc_L2[2];
+ close(ipc_sockets[0]);
+
+ /* Create userns_L1 */
+ if (setup_userns() < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Create socketpair for communicating with L2 child */
+ if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L2) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /*
+ * Fork level 2 child that creates userns_L2
+ */
+ pid_l2 = fork();
+ if (pid_l2 < 0) {
+ close(ipc_sockets[1]);
+ close(ipc_L2[0]);
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ if (pid_l2 == 0) {
+ /* Level 2 child */
+ int ipc_L3[2];
+ close(ipc_L2[0]);
+
+ /* Create userns_L2 (nested inside userns_L1) */
+ if (setup_userns() < 0) {
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ /* Create socketpair for communicating with L3 child */
+ if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L3) < 0) {
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ /*
+ * Fork level 3 child that creates userns_L3 and network namespaces
+ */
+ pid_l3 = fork();
+ if (pid_l3 < 0) {
+ close(ipc_L2[1]);
+ close(ipc_L3[0]);
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ if (pid_l3 == 0) {
+ /* Level 3 child - the deepest level */
+ int sock_fd;
+ close(ipc_L3[0]);
+
+ /* Create userns_L3 (nested inside userns_L2) */
+ if (setup_userns() < 0) {
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ /* Create network namespace at level 3 */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ /* Create socket in net_L3A */
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to L2 parent */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_L3[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_L3[1]);
+ exit(0);
+ }
+
+ /* Level 2 child - receive from L3 and forward to L1 */
+ close(ipc_L3[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+ int received_fd;
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_L3[0], &msg, 0);
+ close(ipc_L3[0]);
+
+ if (n != 1) {
+ close(ipc_L2[1]);
+ waitpid(pid_l3, NULL, 0);
+ exit(1);
+ }
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg) {
+ close(ipc_L2[1]);
+ waitpid(pid_l3, NULL, 0);
+ exit(1);
+ }
+ memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for L3 child */
+ waitpid(pid_l3, NULL, 0);
+
+ /* Forward the socket FD to L1 parent */
+ memset(&msg, 0, sizeof(msg));
+ buf[0] = 'Y';
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int));
+
+ if (sendmsg(ipc_L2[1], &msg, 0) < 0) {
+ close(received_fd);
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ close(received_fd);
+ close(ipc_L2[1]);
+ exit(0);
+ }
+
+ /* Level 1 child - receive from L2 and forward to parent */
+ close(ipc_L2[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+ int received_fd;
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_L2[0], &msg, 0);
+ close(ipc_L2[0]);
+
+ if (n != 1) {
+ close(ipc_sockets[1]);
+ waitpid(pid_l2, NULL, 0);
+ exit(1);
+ }
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg) {
+ close(ipc_sockets[1]);
+ waitpid(pid_l2, NULL, 0);
+ exit(1);
+ }
+ memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for L2 child */
+ waitpid(pid_l2, NULL, 0);
+
+ /* Forward the socket FD to parent */
+ memset(&msg, 0, sizeof(msg));
+ buf[0] = 'Z';
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(received_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(received_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent - receive the socket from the deepest level */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+
+ if (n != 1) {
+ free(handle);
+ waitpid(pid_l1, NULL, 0);
+ SKIP(return, "Failed to receive socket from child");
+ }
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg) {
+ free(handle);
+ waitpid(pid_l1, NULL, 0);
+ SKIP(return, "Failed to receive socket from child");
+ }
+ memcpy(&sock_L3A_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for L1 child */
+ waitpid(pid_l1, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /*
+ * At this point, all child processes have exited. The socket itself
+ * doesn't keep the namespace active - we need to call SIOCGSKNS which
+ * will resurrect the entire namespace tree by taking active references.
+ */
+
+ /* Get network namespace from socket - this resurrects the tree */
+ netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS);
+ if (netns_L3A_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_L3A_fd, 0);
+ }
+
+ /* Get namespace ID for net_L3A */
+ ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id);
+ if (ret < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */
+ int userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS);
+ if (userns_L3_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_USERNS not supported");
+ ASSERT_GE(userns_L3_fd, 0);
+ }
+
+ ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id);
+ ASSERT_EQ(ret, 0);
+
+ int userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS);
+ ASSERT_GE(userns_L2_fd, 0);
+ ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id);
+ ASSERT_EQ(ret, 0);
+
+ int userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS);
+ ASSERT_GE(userns_L1_fd, 0);
+ ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id);
+ ASSERT_EQ(ret, 0);
+
+ close(userns_L1_fd);
+ close(userns_L2_fd);
+ close(userns_L3_fd);
+
+ TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)",
+ netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id);
+
+ /*
+ * Test 1: Verify net_L3A is visible in listns() after resurrection.
+ * The entire ownership chain should be resurrected and visible.
+ */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ bool found_netns_L3A = false;
+ bool found_userns_L1 = false;
+ bool found_userns_L2 = false;
+ bool found_userns_L3 = false;
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_L3A_id)
+ found_netns_L3A = true;
+ if (ns_ids[i] == userns_L1_id)
+ found_userns_L1 = true;
+ if (ns_ids[i] == userns_L2_id)
+ found_userns_L2 = true;
+ if (ns_ids[i] == userns_L3_id)
+ found_userns_L3 = true;
+ }
+
+ ASSERT_TRUE(found_netns_L3A);
+ ASSERT_TRUE(found_userns_L1);
+ ASSERT_TRUE(found_userns_L2);
+ ASSERT_TRUE(found_userns_L3);
+ TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()");
+
+ /*
+ * Test 2: Verify net_L3A can be reopened via file handle.
+ */
+ nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+ nsfs_fh->ns_id = netns_L3A_id;
+ nsfs_fh->ns_type = 0;
+ nsfs_fh->ns_inum = 0;
+
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ close(reopened_fd);
+ TH_LOG("File handle test passed: net_L3A can be reopened");
+
+ /*
+ * Test 3: Verify that when we close the netns FD (dropping the last
+ * active reference), the entire tree becomes inactive and disappears
+ * from listns(). The cascade goes: net_L3A drops -> userns_L3 drops ->
+ * userns_L2 drops -> userns_L1 drops.
+ */
+ close(netns_L3A_fd);
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found_netns_L3A = false;
+ found_userns_L1 = false;
+ found_userns_L2 = false;
+ found_userns_L3 = false;
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_L3A_id)
+ found_netns_L3A = true;
+ if (ns_ids[i] == userns_L1_id)
+ found_userns_L1 = true;
+ if (ns_ids[i] == userns_L2_id)
+ found_userns_L2 = true;
+ if (ns_ids[i] == userns_L3_id)
+ found_userns_L3 = true;
+ }
+
+ ASSERT_FALSE(found_netns_L3A);
+ ASSERT_FALSE(found_userns_L1);
+ ASSERT_FALSE(found_userns_L2);
+ ASSERT_FALSE(found_userns_L3);
+ TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed");
+
+ /*
+ * Test 4: Verify file handle no longer works for inactive namespace.
+ */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd >= 0) {
+ close(reopened_fd);
+ free(handle);
+ ASSERT_TRUE(false); /* Should have failed */
+ }
+ TH_LOG("Inactive namespace correctly cannot be reopened via file handle");
+
+ /*
+ * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again.
+ * The socket is still valid, so we can call SIOCGSKNS on it to resurrect
+ * the namespace tree once more.
+ */
+ netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS);
+ ASSERT_GE(netns_L3A_fd, 0);
+
+ TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree");
+
+ /* Verify the namespace tree is resurrected and visible in listns() */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found_netns_L3A = false;
+ found_userns_L1 = false;
+ found_userns_L2 = false;
+ found_userns_L3 = false;
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_L3A_id)
+ found_netns_L3A = true;
+ if (ns_ids[i] == userns_L1_id)
+ found_userns_L1 = true;
+ if (ns_ids[i] == userns_L2_id)
+ found_userns_L2 = true;
+ if (ns_ids[i] == userns_L3_id)
+ found_userns_L3 = true;
+ }
+
+ ASSERT_TRUE(found_netns_L3A);
+ ASSERT_TRUE(found_userns_L1);
+ ASSERT_TRUE(found_userns_L2);
+ ASSERT_TRUE(found_userns_L3);
+ TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again");
+
+ /* Verify we can reopen via file handle again */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ close(reopened_fd);
+ TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection");
+
+ /* Final cleanup */
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ free(handle);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/stress_test.c b/tools/testing/selftests/namespaces/stress_test.c
new file mode 100644
index 000000000000..dd7df7d6cb27
--- /dev/null
+++ b/tools/testing/selftests/namespaces/stress_test.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Stress tests for namespace active reference counting.
+ *
+ * These tests validate that the active reference counting system can handle
+ * high load scenarios including rapid namespace creation/destruction, large
+ * numbers of concurrent namespaces, and various edge cases under stress.
+ */
+
+/*
+ * Test rapid creation and destruction of user namespaces.
+ * Create and destroy namespaces in quick succession to stress the
+ * active reference tracking and ensure no leaks occur.
+ */
+TEST(rapid_namespace_creation_destruction)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[256], ns_ids_after[256];
+ ssize_t ret_before, ret_after;
+ int i;
+
+ /* Get baseline count of active user namespaces */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+ /* Rapidly create and destroy 100 user namespaces */
+ for (i = 0; i < 100; i++) {
+ pid_t pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create user namespace and immediately exit */
+ if (setup_userns() < 0)
+ exit(1);
+ exit(0);
+ }
+
+ /* Parent: wait for child */
+ int status;
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+ }
+
+ /* Verify we're back to baseline (no leaked namespaces) */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test creating many concurrent namespaces.
+ * Verify that listns() correctly tracks all of them and that they all
+ * become inactive after processes exit.
+ */
+TEST(many_concurrent_namespaces)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512];
+ ssize_t ret_before, ret_during, ret_after;
+ pid_t pids[50];
+ int num_children = 50;
+ int i;
+ int sv[2];
+
+ /* Get baseline */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create many children, each with their own user namespace */
+ for (i = 0; i < num_children; i++) {
+ pids[i] = fork();
+ ASSERT_GE(pids[i], 0);
+
+ if (pids[i] == 0) {
+ /* Child: create user namespace and wait for parent signal */
+ char c;
+
+ close(sv[0]);
+
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Signal parent we're ready */
+ if (write(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal to exit */
+ if (read(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ close(sv[1]);
+ exit(0);
+ }
+ }
+
+ close(sv[1]);
+
+ /* Wait for all children to signal ready */
+ for (i = 0; i < num_children; i++) {
+ char c;
+ if (read(sv[0], &c, 1) != 1) {
+ /* If we fail to read, kill all children and exit */
+ close(sv[0]);
+ for (int j = 0; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ /* List namespaces while all children are running */
+ ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0);
+ ASSERT_GE(ret_during, 0);
+
+ TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during);
+
+ /* Should have at least num_children more namespaces than baseline */
+ ASSERT_GE(ret_during, ret_before + num_children);
+
+ /* Signal all children to exit */
+ for (i = 0; i < num_children; i++) {
+ char c = 'X';
+ if (write(sv[0], &c, 1) != 1) {
+ /* If we fail to write, kill remaining children */
+ close(sv[0]);
+ for (int j = i; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ close(sv[0]);
+
+ /* Wait for all children */
+ for (i = 0; i < num_children; i++) {
+ int status;
+ waitpid(pids[i], &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After all children exit: %zd active user namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test rapid namespace creation with different namespace types.
+ * Create multiple types of namespaces rapidly to stress the tracking system.
+ */
+TEST(rapid_mixed_namespace_creation)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0, /* All types */
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_after[512];
+ ssize_t ret_before, ret_after;
+ int i;
+
+ /* Get baseline count */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active namespaces (all types)", ret_before);
+
+ /* Rapidly create and destroy namespaces with multiple types */
+ for (i = 0; i < 50; i++) {
+ pid_t pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create multiple namespace types */
+ if (setup_userns() < 0)
+ exit(1);
+
+ /* Create additional namespace types */
+ if (unshare(CLONE_NEWNET) < 0)
+ exit(1);
+ if (unshare(CLONE_NEWUTS) < 0)
+ exit(1);
+ if (unshare(CLONE_NEWIPC) < 0)
+ exit(1);
+
+ exit(0);
+ }
+
+ /* Parent: wait for child */
+ int status;
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test nested namespace creation under stress.
+ * Create deeply nested namespace hierarchies and verify proper cleanup.
+ */
+TEST(nested_namespace_stress)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_after[512];
+ ssize_t ret_before, ret_after;
+ int i;
+
+ /* Get baseline */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+ /* Create 20 processes, each with nested user namespaces */
+ for (i = 0; i < 20; i++) {
+ pid_t pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int userns_fd;
+ uid_t orig_uid = getuid();
+ int depth;
+
+ /* Create nested user namespaces (up to 5 levels) */
+ for (depth = 0; depth < 5; depth++) {
+ userns_fd = get_userns_fd(0, (depth == 0) ? orig_uid : 0, 1);
+ if (userns_fd < 0)
+ exit(1);
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ exit(1);
+ }
+ close(userns_fd);
+ }
+
+ exit(0);
+ }
+
+ /* Parent: wait for child */
+ int status;
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test listns() pagination under stress.
+ * Create many namespaces and verify pagination works correctly.
+ */
+TEST(listns_pagination_stress)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ pid_t pids[30];
+ int num_children = 30;
+ int i;
+ int sv[2];
+ __u64 all_ns_ids[512];
+ int total_found = 0;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create many children with user namespaces */
+ for (i = 0; i < num_children; i++) {
+ pids[i] = fork();
+ ASSERT_GE(pids[i], 0);
+
+ if (pids[i] == 0) {
+ char c;
+ close(sv[0]);
+
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Signal parent we're ready */
+ if (write(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal to exit */
+ if (read(sv[1], &c, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ close(sv[1]);
+ exit(0);
+ }
+ }
+
+ close(sv[1]);
+
+ /* Wait for all children to signal ready */
+ for (i = 0; i < num_children; i++) {
+ char c;
+ if (read(sv[0], &c, 1) != 1) {
+ /* If we fail to read, kill all children and exit */
+ close(sv[0]);
+ for (int j = 0; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ /* Paginate through all namespaces using small batch sizes */
+ req.ns_id = 0;
+ while (1) {
+ __u64 batch[5]; /* Small batch size to force pagination */
+ ssize_t ret;
+
+ ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS) {
+ close(sv[0]);
+ for (i = 0; i < num_children; i++)
+ kill(pids[i], SIGKILL);
+ for (i = 0; i < num_children; i++)
+ waitpid(pids[i], NULL, 0);
+ SKIP(return, "listns() not supported");
+ }
+ ASSERT_GE(ret, 0);
+ }
+
+ if (ret == 0)
+ break;
+
+ /* Store results */
+ for (i = 0; i < ret && total_found < 512; i++) {
+ all_ns_ids[total_found++] = batch[i];
+ }
+
+ /* Update cursor for next batch */
+ if (ret == ARRAY_SIZE(batch))
+ req.ns_id = batch[ret - 1];
+ else
+ break;
+ }
+
+ TH_LOG("Paginated through %d user namespaces", total_found);
+
+ /* Verify no duplicates in pagination */
+ for (i = 0; i < total_found; i++) {
+ for (int j = i + 1; j < total_found; j++) {
+ if (all_ns_ids[i] == all_ns_ids[j]) {
+ TH_LOG("Found duplicate ns_id: %llu at positions %d and %d",
+ (unsigned long long)all_ns_ids[i], i, j);
+ ASSERT_TRUE(false);
+ }
+ }
+ }
+
+ /* Signal all children to exit */
+ for (i = 0; i < num_children; i++) {
+ char c = 'X';
+ if (write(sv[0], &c, 1) != 1) {
+ close(sv[0]);
+ for (int j = i; j < num_children; j++)
+ kill(pids[j], SIGKILL);
+ for (int j = 0; j < num_children; j++)
+ waitpid(pids[j], NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ }
+
+ close(sv[0]);
+
+ /* Wait for all children */
+ for (i = 0; i < num_children; i++) {
+ int status;
+ waitpid(pids[i], &status, 0);
+ }
+}
+
+/*
+ * Test concurrent namespace operations.
+ * Multiple processes creating, querying, and destroying namespaces concurrently.
+ */
+TEST(concurrent_namespace_operations)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = 0,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_after[512];
+ ssize_t ret_before, ret_after;
+ pid_t pids[20];
+ int num_workers = 20;
+ int i;
+
+ /* Get baseline */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active namespaces", ret_before);
+
+ /* Create worker processes that do concurrent operations */
+ for (i = 0; i < num_workers; i++) {
+ pids[i] = fork();
+ ASSERT_GE(pids[i], 0);
+
+ if (pids[i] == 0) {
+ /* Each worker: create namespaces, list them, repeat */
+ int iterations;
+
+ for (iterations = 0; iterations < 10; iterations++) {
+ int userns_fd;
+ __u64 temp_ns_ids[100];
+ ssize_t ret;
+
+ /* Create a user namespace */
+ userns_fd = get_userns_fd(0, getuid(), 1);
+ if (userns_fd < 0)
+ continue;
+
+ /* List namespaces */
+ ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0);
+ (void)ret;
+
+ close(userns_fd);
+
+ /* Small delay */
+ usleep(1000);
+ }
+
+ exit(0);
+ }
+ }
+
+ /* Wait for all workers */
+ for (i = 0; i < num_workers; i++) {
+ int status;
+ waitpid(pids[i], &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After concurrent operations: %zd active namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test namespace churn - continuous creation and destruction.
+ * Simulates high-churn scenarios like container orchestration.
+ */
+TEST(namespace_churn)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids_before[512], ns_ids_after[512];
+ ssize_t ret_before, ret_after;
+ int cycle;
+
+ /* Get baseline */
+ ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+ if (ret_before < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret_before, 0);
+ }
+
+ TH_LOG("Baseline: %zd active namespaces", ret_before);
+
+ /* Simulate churn: batches of namespaces created and destroyed */
+ for (cycle = 0; cycle < 10; cycle++) {
+ pid_t batch_pids[10];
+ int i;
+
+ /* Create batch */
+ for (i = 0; i < 10; i++) {
+ batch_pids[i] = fork();
+ ASSERT_GE(batch_pids[i], 0);
+
+ if (batch_pids[i] == 0) {
+ /* Create multiple namespace types */
+ if (setup_userns() < 0)
+ exit(1);
+ if (unshare(CLONE_NEWNET) < 0)
+ exit(1);
+ if (unshare(CLONE_NEWUTS) < 0)
+ exit(1);
+
+ /* Keep namespaces alive briefly */
+ usleep(10000);
+ exit(0);
+ }
+ }
+
+ /* Wait for batch to complete */
+ for (i = 0; i < 10; i++) {
+ int status;
+ waitpid(batch_pids[i], &status, 0);
+ }
+ }
+
+ /* Verify we're back to baseline */
+ ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+ ASSERT_GE(ret_after, 0);
+
+ TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after);
+ ASSERT_EQ(ret_before, ret_after);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/wrappers.h b/tools/testing/selftests/namespaces/wrappers.h
new file mode 100644
index 000000000000..9741a64a5b1d
--- /dev/null
+++ b/tools/testing/selftests/namespaces/wrappers.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/nsfs.h>
+#include <linux/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__
+#define __SELFTESTS_NAMESPACES_WRAPPERS_H__
+
+#ifndef __NR_listns
+ #if defined __alpha__
+ #define __NR_listns 580
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_listns 4470
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_listns 6470
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_listns 5470
+ #endif
+ #else
+ #define __NR_listns 470
+ #endif
+#endif
+
+static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids,
+ size_t nr_ns_ids, unsigned int flags)
+{
+ return syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags);
+}
+
+#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 439101b518ee..8f9850a71f54 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -45,6 +45,7 @@ skf_net_off
socket
so_incoming_cpu
so_netns_cookie
+so_peek_off
so_txtime
so_rcv_listener
stress_reuseport_listen
diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile
index de805cbbdf69..528d14c598bb 100644
--- a/tools/testing/selftests/net/af_unix/Makefile
+++ b/tools/testing/selftests/net/af_unix/Makefile
@@ -6,6 +6,7 @@ TEST_GEN_PROGS := \
scm_inq \
scm_pidfd \
scm_rights \
+ so_peek_off \
unix_connect \
# end of TEST_GEN_PROGS
diff --git a/tools/testing/selftests/net/af_unix/so_peek_off.c b/tools/testing/selftests/net/af_unix/so_peek_off.c
new file mode 100644
index 000000000000..1a77728128e5
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/so_peek_off.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2025 Google LLC */
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <sys/socket.h>
+
+#include "../../kselftest_harness.h"
+
+FIXTURE(so_peek_off)
+{
+ int fd[2]; /* 0: sender, 1: receiver */
+};
+
+FIXTURE_VARIANT(so_peek_off)
+{
+ int type;
+};
+
+FIXTURE_VARIANT_ADD(so_peek_off, stream)
+{
+ .type = SOCK_STREAM,
+};
+
+FIXTURE_VARIANT_ADD(so_peek_off, dgram)
+{
+ .type = SOCK_DGRAM,
+};
+
+FIXTURE_VARIANT_ADD(so_peek_off, seqpacket)
+{
+ .type = SOCK_SEQPACKET,
+};
+
+FIXTURE_SETUP(so_peek_off)
+{
+ struct timeval timeout = {
+ .tv_sec = 0,
+ .tv_usec = 3000,
+ };
+ int ret;
+
+ ret = socketpair(AF_UNIX, variant->type, 0, self->fd);
+ ASSERT_EQ(0, ret);
+
+ ret = setsockopt(self->fd[1], SOL_SOCKET, SO_RCVTIMEO_NEW,
+ &timeout, sizeof(timeout));
+ ASSERT_EQ(0, ret);
+
+ ret = setsockopt(self->fd[1], SOL_SOCKET, SO_PEEK_OFF,
+ &(int){0}, sizeof(int));
+ ASSERT_EQ(0, ret);
+}
+
+FIXTURE_TEARDOWN(so_peek_off)
+{
+ close_range(self->fd[0], self->fd[1], 0);
+}
+
+#define sendeq(fd, str, flags) \
+ do { \
+ int bytes, len = strlen(str); \
+ \
+ bytes = send(fd, str, len, flags); \
+ ASSERT_EQ(len, bytes); \
+ } while (0)
+
+#define recveq(fd, str, buflen, flags) \
+ do { \
+ char buf[(buflen) + 1] = {}; \
+ int bytes; \
+ \
+ bytes = recv(fd, buf, buflen, flags); \
+ ASSERT_NE(-1, bytes); \
+ ASSERT_STREQ(str, buf); \
+ } while (0)
+
+#define async \
+ for (pid_t pid = (pid = fork(), \
+ pid < 0 ? \
+ __TH_LOG("Failed to start async {}"), \
+ _metadata->exit_code = KSFT_FAIL, \
+ __bail(1, _metadata), \
+ 0xdead : \
+ pid); \
+ !pid; exit(0))
+
+TEST_F(so_peek_off, single_chunk)
+{
+ sendeq(self->fd[0], "aaaabbbb", 0);
+
+ recveq(self->fd[1], "aaaa", 4, MSG_PEEK);
+ recveq(self->fd[1], "bbbb", 100, MSG_PEEK);
+}
+
+TEST_F(so_peek_off, two_chunks)
+{
+ sendeq(self->fd[0], "aaaa", 0);
+ sendeq(self->fd[0], "bbbb", 0);
+
+ recveq(self->fd[1], "aaaa", 4, MSG_PEEK);
+ recveq(self->fd[1], "bbbb", 100, MSG_PEEK);
+}
+
+TEST_F(so_peek_off, two_chunks_blocking)
+{
+ async {
+ usleep(1000);
+ sendeq(self->fd[0], "aaaa", 0);
+ }
+
+ recveq(self->fd[1], "aaaa", 4, MSG_PEEK);
+
+ async {
+ usleep(1000);
+ sendeq(self->fd[0], "bbbb", 0);
+ }
+
+ /* goto again; -> goto redo; in unix_stream_read_generic(). */
+ recveq(self->fd[1], "bbbb", 100, MSG_PEEK);
+}
+
+TEST_F(so_peek_off, two_chunks_overlap)
+{
+ sendeq(self->fd[0], "aaaa", 0);
+ recveq(self->fd[1], "aa", 2, MSG_PEEK);
+
+ sendeq(self->fd[0], "bbbb", 0);
+
+ if (variant->type == SOCK_STREAM) {
+ /* SOCK_STREAM tries to fill the buffer. */
+ recveq(self->fd[1], "aabb", 4, MSG_PEEK);
+ recveq(self->fd[1], "bb", 100, MSG_PEEK);
+ } else {
+ /* SOCK_DGRAM and SOCK_SEQPACKET returns at the skb boundary. */
+ recveq(self->fd[1], "aa", 100, MSG_PEEK);
+ recveq(self->fd[1], "bbbb", 100, MSG_PEEK);
+ }
+}
+
+TEST_F(so_peek_off, two_chunks_overlap_blocking)
+{
+ async {
+ usleep(1000);
+ sendeq(self->fd[0], "aaaa", 0);
+ }
+
+ recveq(self->fd[1], "aa", 2, MSG_PEEK);
+
+ async {
+ usleep(1000);
+ sendeq(self->fd[0], "bbbb", 0);
+ }
+
+ /* Even SOCK_STREAM does not wait if at least one byte is read. */
+ recveq(self->fd[1], "aa", 100, MSG_PEEK);
+
+ recveq(self->fd[1], "bbbb", 100, MSG_PEEK);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/bareudp.sh b/tools/testing/selftests/net/bareudp.sh
index 4046131e7888..d9e5b967f815 100755
--- a/tools/testing/selftests/net/bareudp.sh
+++ b/tools/testing/selftests/net/bareudp.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Test various bareudp tunnel configurations.
diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh
index ff2accccaf4d..b4eda6c6199e 100755
--- a/tools/testing/selftests/net/forwarding/lib_sh_test.sh
+++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh
@@ -30,6 +30,11 @@ tfail()
do_test "tfail" false
}
+tfail2()
+{
+ do_test "tfail2" false
+}
+
txfail()
{
FAIL_TO_XFAIL=yes do_test "txfail" false
@@ -132,6 +137,8 @@ test_ret()
ret_subtest $ksft_fail "tfail" txfail tfail
ret_subtest $ksft_xfail "txfail" txfail txfail
+
+ ret_subtest $ksft_fail "tfail2" tfail2 tfail
}
exit_status_tests_run()
diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh
index ecd34f364125..892895659c7e 100755
--- a/tools/testing/selftests/net/forwarding/local_termination.sh
+++ b/tools/testing/selftests/net/forwarding/local_termination.sh
@@ -176,6 +176,8 @@ run_test()
local rcv_dmac=$(mac_get $rcv_if_name)
local should_receive
+ setup_wait
+
tcpdump_start $rcv_if_name
mc_route_prepare $send_if_name
diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
index 2b1d9f2b3e9e..cfc39f70635d 100644
--- a/tools/testing/selftests/net/gro.c
+++ b/tools/testing/selftests/net/gro.c
@@ -754,11 +754,11 @@ static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1,
static char exthdr_pck[sizeof(buf) + MIN_EXTHDR_SIZE];
create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
- add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS, ext_data1);
+ add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data1);
write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0);
- add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS, ext_data2);
+ add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data2);
write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
}
@@ -989,6 +989,7 @@ static void check_recv_pkts(int fd, int *correct_payload,
static void gro_sender(void)
{
+ const int fin_delay_us = 100 * 1000;
static char fin_pkt[MAX_HDR_LEN];
struct sockaddr_ll daddr = {};
int txfd = -1;
@@ -1032,15 +1033,22 @@ static void gro_sender(void)
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
} else if (strcmp(testname, "tcp") == 0) {
send_changed_checksum(txfd, &daddr);
+ /* Adding sleep before sending FIN so that it is not
+ * received prior to other packets.
+ */
+ usleep(fin_delay_us);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
send_changed_seq(txfd, &daddr);
+ usleep(fin_delay_us);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
send_changed_ts(txfd, &daddr);
+ usleep(fin_delay_us);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
send_diff_opt(txfd, &daddr);
+ usleep(fin_delay_us);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
} else if (strcmp(testname, "ip") == 0) {
send_changed_ECN(txfd, &daddr);
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index feba4ef69a54..f448bafb3f20 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -43,7 +43,7 @@ __ksft_status_merge()
weights[$i]=$((weight++))
done
- if [[ ${weights[$a]} > ${weights[$b]} ]]; then
+ if [[ ${weights[$a]} -ge ${weights[$b]} ]]; then
echo "$a"
return 0
else
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index b148cadb96d0..fc7e22b503d3 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -710,8 +710,14 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd,
bw = do_rnd_write(peerfd, winfo->buf + winfo->off, winfo->len);
if (bw < 0) {
- if (cfg_rcv_trunc)
- return 0;
+ /* expected reset, continue to read */
+ if (cfg_rcv_trunc &&
+ (errno == ECONNRESET ||
+ errno == EPIPE)) {
+ fds.events &= ~POLLOUT;
+ continue;
+ }
+
perror("write");
return 111;
}
@@ -737,8 +743,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd,
}
if (fds.revents & (POLLERR | POLLNVAL)) {
- if (cfg_rcv_trunc)
- return 0;
+ if (cfg_rcv_trunc) {
+ fds.events &= ~(POLLERR | POLLNVAL);
+ continue;
+ }
fprintf(stderr, "Unexpected revents: "
"POLLERR/POLLNVAL(%x)\n", fds.revents);
return 5;
@@ -1433,7 +1441,7 @@ static void parse_opts(int argc, char **argv)
*/
if (cfg_truncate < 0) {
cfg_rcv_trunc = true;
- signal(SIGPIPE, handle_signal);
+ signal(SIGPIPE, SIG_IGN);
}
break;
case 'j':
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 47ecb5b3836e..9b7b93f8eb0c 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -492,7 +492,7 @@ do_transfer()
"than expected (${expect_synrx})"
retc=1
fi
- if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ] && [ ${stat_ooo_now} -eq 0 ]; then
+ if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ]; then
if [ ${stat_ooo_now} -eq 0 ]; then
mptcp_lib_pr_fail "lower MPC ACK rx (${stat_ackrx_now_l})" \
"than expected (${expect_ackrx})"
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 78a1aa4ecff2..43f31f8d587f 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -2532,7 +2532,7 @@ remove_tests()
if reset "remove single subflow"; then
pm_nl_set_limits $ns1 0 1
pm_nl_set_limits $ns2 0 1
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
addr_nr_ns2=-1 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 1 1 1
@@ -2545,8 +2545,8 @@ remove_tests()
if reset "remove multiple subflows"; then
pm_nl_set_limits $ns1 0 2
pm_nl_set_limits $ns2 0 2
- pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
addr_nr_ns2=-2 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 2 2 2
@@ -2557,7 +2557,7 @@ remove_tests()
# single address, remove
if reset "remove single address"; then
pm_nl_set_limits $ns1 0 1
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
pm_nl_set_limits $ns2 1 1
addr_nr_ns1=-1 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
@@ -2570,9 +2570,9 @@ remove_tests()
# subflow and signal, remove
if reset "remove subflow and signal"; then
pm_nl_set_limits $ns1 0 2
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
pm_nl_set_limits $ns2 1 2
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
addr_nr_ns1=-1 addr_nr_ns2=-1 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 2 2 2
@@ -2584,10 +2584,10 @@ remove_tests()
# subflows and signal, remove
if reset "remove subflows and signal"; then
pm_nl_set_limits $ns1 0 3
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
pm_nl_set_limits $ns2 1 3
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
- pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
+ pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup
addr_nr_ns1=-1 addr_nr_ns2=-2 speed=10 \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 3 3 3
@@ -2599,9 +2599,9 @@ remove_tests()
# addresses remove
if reset "remove addresses"; then
pm_nl_set_limits $ns1 3 3
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250
- pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.4.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250
+ pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup
pm_nl_set_limits $ns2 3 3
addr_nr_ns1=-3 speed=10 \
run_tests $ns1 $ns2 10.0.1.1
@@ -2614,10 +2614,10 @@ remove_tests()
# invalid addresses remove
if reset "remove invalid addresses"; then
pm_nl_set_limits $ns1 3 3
- pm_nl_add_endpoint $ns1 10.0.12.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup
# broadcast IP: no packet for this address will be received on ns1
- pm_nl_add_endpoint $ns1 224.0.0.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+ pm_nl_add_endpoint $ns1 224.0.0.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup
pm_nl_set_limits $ns2 2 2
addr_nr_ns1=-3 speed=10 \
run_tests $ns1 $ns2 10.0.1.1
@@ -2631,10 +2631,10 @@ remove_tests()
# subflows and signal, flush
if reset "flush subflows and signal"; then
pm_nl_set_limits $ns1 0 3
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
pm_nl_set_limits $ns2 1 3
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
- pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
+ pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup
addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 3 3 3
@@ -2647,9 +2647,9 @@ remove_tests()
if reset "flush subflows"; then
pm_nl_set_limits $ns1 3 3
pm_nl_set_limits $ns2 3 3
- pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow id 150
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
- pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup id 150
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
+ pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup
addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 3 3 3
@@ -2666,9 +2666,9 @@ remove_tests()
# addresses flush
if reset "flush addresses"; then
pm_nl_set_limits $ns1 3 3
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250
- pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.4.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250
+ pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup
pm_nl_set_limits $ns2 3 3
addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
@@ -2681,9 +2681,9 @@ remove_tests()
# invalid addresses flush
if reset "flush invalid addresses"; then
pm_nl_set_limits $ns1 3 3
- pm_nl_add_endpoint $ns1 10.0.12.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.14.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.14.1 flags signal,backup
pm_nl_set_limits $ns2 3 3
addr_nr_ns1=-8 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
@@ -3500,7 +3500,6 @@ fullmesh_tests()
fastclose_tests()
{
if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then
- MPTCP_LIB_SUBTEST_FLAKY=1
test_linkfail=1024 fastclose=client \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 0 0 0
@@ -3509,7 +3508,6 @@ fastclose_tests()
fi
if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then
- MPTCP_LIB_SUBTEST_FLAKY=1
test_linkfail=1024 fastclose=server \
run_tests $ns1 $ns2 10.0.1.1
join_rst_nr=1 \
@@ -3806,7 +3804,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns1
pm_nl_set_limits $ns2 2 2
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns1
@@ -3831,7 +3829,7 @@ userspace_tests()
chk_mptcp_info subflows 0 subflows 0
chk_subflows_total 1 1
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
# userspace pm create destroy subflow
@@ -3839,7 +3837,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns2
@@ -3859,7 +3857,7 @@ userspace_tests()
chk_mptcp_info subflows 0 subflows 0
chk_subflows_total 1 1
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
# userspace pm create id 0 subflow
@@ -3867,7 +3865,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns2
@@ -3880,7 +3878,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 2 2
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
# userspace pm remove initial subflow
@@ -3888,7 +3886,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns2
@@ -3904,7 +3902,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 1 1
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
# userspace pm send RM_ADDR for ID 0
@@ -3912,7 +3910,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns1
pm_nl_set_limits $ns2 1 1
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns1
@@ -3930,7 +3928,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 1 1
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
}
@@ -3943,7 +3941,7 @@ endpoint_tests()
pm_nl_set_limits $ns1 2 2
pm_nl_set_limits $ns2 2 2
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
- { speed=slow \
+ { timeout_test=120 test_linkfail=128 speed=slow \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
@@ -3960,7 +3958,7 @@ endpoint_tests()
pm_nl_add_endpoint $ns2 10.0.2.2 flags signal
pm_nl_check_endpoint "modif is allowed" \
$ns2 10.0.2.2 id 1 flags signal
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT &&
@@ -3970,7 +3968,7 @@ endpoint_tests()
pm_nl_set_limits $ns2 0 3
pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow
pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
- { test_linkfail=4 speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
@@ -4015,7 +4013,7 @@ endpoint_tests()
chk_mptcp_info subflows 3 subflows 3
done
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
kill_events_pids
chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1
@@ -4048,7 +4046,7 @@ endpoint_tests()
# broadcast IP: no packet for this address will be received on ns1
pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal
pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal
- { test_linkfail=4 speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
@@ -4057,39 +4055,46 @@ endpoint_tests()
$ns1 10.0.2.1 id 1 flags signal
chk_subflow_nr "before delete" 2
chk_mptcp_info subflows 1 subflows 1
+ chk_mptcp_info add_addr_signal 2 add_addr_accepted 1
pm_nl_del_endpoint $ns1 1 10.0.2.1
pm_nl_del_endpoint $ns1 2 224.0.0.1
sleep 0.5
chk_subflow_nr "after delete" 1
chk_mptcp_info subflows 0 subflows 0
+ chk_mptcp_info add_addr_signal 0 add_addr_accepted 0
pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal
pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
wait_mpj $ns2
chk_subflow_nr "after re-add" 3
chk_mptcp_info subflows 2 subflows 2
+ chk_mptcp_info add_addr_signal 2 add_addr_accepted 2
pm_nl_del_endpoint $ns1 42 10.0.1.1
sleep 0.5
chk_subflow_nr "after delete ID 0" 2
chk_mptcp_info subflows 2 subflows 2
+ chk_mptcp_info add_addr_signal 2 add_addr_accepted 2
pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal
wait_mpj $ns2
chk_subflow_nr "after re-add ID 0" 3
chk_mptcp_info subflows 3 subflows 3
+ chk_mptcp_info add_addr_signal 3 add_addr_accepted 2
pm_nl_del_endpoint $ns1 99 10.0.1.1
sleep 0.5
chk_subflow_nr "after re-delete ID 0" 2
chk_mptcp_info subflows 2 subflows 2
+ chk_mptcp_info add_addr_signal 2 add_addr_accepted 2
pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal
wait_mpj $ns2
chk_subflow_nr "after re-re-add ID 0" 3
chk_mptcp_info subflows 3 subflows 3
- mptcp_lib_kill_wait $tests_pid
+ chk_mptcp_info add_addr_signal 3 add_addr_accepted 2
+ mptcp_lib_kill_group_wait $tests_pid
kill_events_pids
chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1
@@ -4121,7 +4126,7 @@ endpoint_tests()
# broadcast IP: no packet for this address will be received on ns1
pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal
pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
- { test_linkfail=4 speed=20 \
+ { timeout_test=120 test_linkfail=128 speed=20 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
@@ -4137,7 +4142,7 @@ endpoint_tests()
wait_mpj $ns2
pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
wait_mpj $ns2
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
join_syn_tx=3 join_connect_err=1 \
chk_join_nr 2 2 2
diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
index d62e653d48b0..f4388900016a 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
@@ -350,6 +350,27 @@ mptcp_lib_kill_wait() {
wait "${1}" 2>/dev/null
}
+# $1: PID
+mptcp_lib_pid_list_children() {
+ local curr="${1}"
+ # evoke 'ps' only once
+ local pids="${2:-"$(ps o pid,ppid)"}"
+
+ echo "${curr}"
+
+ local pid
+ for pid in $(echo "${pids}" | awk "\$2 == ${curr} { print \$1 }"); do
+ mptcp_lib_pid_list_children "${pid}" "${pids}"
+ done
+}
+
+# $1: PID
+mptcp_lib_kill_group_wait() {
+ # Some users might not have procps-ng: cannot use "kill -- -PID"
+ mptcp_lib_pid_list_children "${1}" | xargs -r kill &>/dev/null
+ wait "${1}" 2>/dev/null
+}
+
# $1: IP address
mptcp_lib_is_v6() {
[ -z "${1##*:*}" ]
diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc
index 330e000baeb1..f9d43cbdc894 100644
--- a/tools/testing/selftests/nolibc/Makefile.nolibc
+++ b/tools/testing/selftests/nolibc/Makefile.nolibc
@@ -87,7 +87,6 @@ IMAGE_riscv = arch/riscv/boot/Image
IMAGE_riscv32 = arch/riscv/boot/Image
IMAGE_riscv64 = arch/riscv/boot/Image
IMAGE_s390x = arch/s390/boot/bzImage
-IMAGE_s390 = arch/s390/boot/bzImage
IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi
IMAGE_sparc32 = arch/sparc/boot/image
IMAGE_sparc64 = arch/sparc/boot/image
@@ -117,7 +116,6 @@ DEFCONFIG_riscv = defconfig
DEFCONFIG_riscv32 = rv32_defconfig
DEFCONFIG_riscv64 = defconfig
DEFCONFIG_s390x = defconfig
-DEFCONFIG_s390 = defconfig compat.config
DEFCONFIG_loongarch = defconfig
DEFCONFIG_sparc32 = sparc32_defconfig
DEFCONFIG_sparc64 = sparc64_defconfig
@@ -156,7 +154,6 @@ QEMU_ARCH_riscv = riscv64
QEMU_ARCH_riscv32 = riscv32
QEMU_ARCH_riscv64 = riscv64
QEMU_ARCH_s390x = s390x
-QEMU_ARCH_s390 = s390x
QEMU_ARCH_loongarch = loongarch64
QEMU_ARCH_sparc32 = sparc
QEMU_ARCH_sparc64 = sparc64
@@ -197,7 +194,6 @@ QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_T
QEMU_ARGS_riscv32 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_riscv64 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
-QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_sparc32 = -M SS-5 -m 256M -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_sparc64 = -M sun4u -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
@@ -223,13 +219,13 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple)
CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple)
CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2)
CFLAGS_s390x = -m64
-CFLAGS_s390 = -m31
CFLAGS_mips32le = -EL -mabi=32 -fPIC
CFLAGS_mips32be = -EB -mabi=32
CFLAGS_mipsn32le = -EL -mabi=n32 -fPIC -march=mips64r2
CFLAGS_mipsn32be = -EB -mabi=n32 -march=mips64r6
CFLAGS_mips64le = -EL -mabi=64 -march=mips64r6
CFLAGS_mips64be = -EB -mabi=64 -march=mips64r2
+CFLAGS_loongarch = $(if $(LLVM),-fuse-ld=lld)
CFLAGS_sparc32 = $(call cc-option,-m32)
CFLAGS_sh4 = -ml -m4
ifeq ($(origin XARCH),command line)
diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 29de21595fc9..3c5a226dad3a 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -25,6 +25,7 @@
#include <sys/sysmacros.h>
#include <sys/time.h>
#include <sys/timerfd.h>
+#include <sys/uio.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <dirent.h>
@@ -1282,6 +1283,10 @@ int run_syscall(int min, int max)
int proc;
int test;
int tmp;
+ struct iovec iov_one = {
+ .iov_base = &tmp,
+ .iov_len = 1,
+ };
int ret = 0;
void *p1, *p2;
int has_gettid = 1;
@@ -1343,6 +1348,8 @@ int run_syscall(int min, int max)
CASE_TEST(dup3_0); tmp = dup3(0, 100, 0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break;
CASE_TEST(dup3_m1); tmp = dup3(-1, 100, 0); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break;
CASE_TEST(execve_root); EXPECT_SYSER(1, execve("/", (char*[]){ [0] = "/", [1] = NULL }, NULL), -1, EACCES); break;
+ CASE_TEST(fchdir_stdin); EXPECT_SYSER(1, fchdir(STDIN_FILENO), -1, ENOTDIR); break;
+ CASE_TEST(fchdir_badfd); EXPECT_SYSER(1, fchdir(-1), -1, EBADF); break;
CASE_TEST(file_stream); EXPECT_SYSZR(1, test_file_stream()); break;
CASE_TEST(fork); EXPECT_SYSZR(1, test_fork(FORK_STANDARD)); break;
CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break;
@@ -1395,6 +1402,10 @@ int run_syscall(int min, int max)
CASE_TEST(waitpid_child); EXPECT_SYSER(1, waitpid(getpid(), &tmp, WNOHANG), -1, ECHILD); break;
CASE_TEST(write_badf); EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break;
CASE_TEST(write_zero); EXPECT_SYSZR(1, write(1, &tmp, 0)); break;
+ CASE_TEST(readv_badf); EXPECT_SYSER(1, readv(-1, &iov_one, 1), -1, EBADF); break;
+ CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(1, NULL, 0)); break;
+ CASE_TEST(writev_badf); EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break;
+ CASE_TEST(writev_zero); EXPECT_SYSZR(1, writev(1, NULL, 0)); break;
CASE_TEST(syscall_noargs); EXPECT_SYSEQ(1, syscall(__NR_getpid), getpid()); break;
CASE_TEST(syscall_args); EXPECT_SYSER(1, syscall(__NR_statx, 0, NULL, 0, 0, NULL), -1, EFAULT); break;
CASE_TEST(namespace); EXPECT_SYSZR(euid0 && proc, test_namespace()); break;
@@ -1540,6 +1551,8 @@ int run_stdlib(int min, int max)
CASE_TEST(abs); EXPECT_EQ(1, abs(-10), 10); break;
CASE_TEST(abs_noop); EXPECT_EQ(1, abs(10), 10); break;
CASE_TEST(difftime); EXPECT_ZR(1, test_difftime()); break;
+ CASE_TEST(memchr_foobar6_o); EXPECT_STREQ(1, memchr("foobar", 'o', 6), "oobar"); break;
+ CASE_TEST(memchr_foobar3_b); EXPECT_STRZR(1, memchr("foobar", 'b', 3)); break;
case __LINE__:
return ret; /* must be last */
diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh
index e8af1fb505cf..3917cfb8fdc4 100755
--- a/tools/testing/selftests/nolibc/run-tests.sh
+++ b/tools/testing/selftests/nolibc/run-tests.sh
@@ -23,7 +23,7 @@ all_archs=(
mips32le mips32be mipsn32le mipsn32be mips64le mips64be
ppc ppc64 ppc64le
riscv32 riscv64
- s390x s390
+ s390x
loongarch
sparc32 sparc64
m68k
@@ -169,7 +169,7 @@ test_arch() {
cross_compile=$(realpath "${download_location}gcc-${crosstool_version}-nolibc/${ct_arch}-${ct_abi}/bin/${ct_arch}-${ct_abi}-")
build_dir="${build_location}/${arch}"
if [ "$werror" -ne 0 ]; then
- CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror"
+ CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror -Wl,--fatal-warnings"
fi
MAKE=(make -f Makefile.nolibc -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}")
@@ -185,10 +185,6 @@ test_arch() {
exit 1
esac
printf '%-15s' "$arch:"
- if [ "$arch" = "s390" ] && ([ "$llvm" = "1" ] || [ "$test_mode" = "user" ]); then
- echo "Unsupported configuration"
- return
- fi
if [ "$arch" = "m68k" -o "$arch" = "sh4" ] && [ "$llvm" = "1" ]; then
echo "Unsupported configuration"
return
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index f87993def738..d60f10a873bb 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -148,6 +148,14 @@
#define PIDFD_INFO_COREDUMP (1UL << 4)
#endif
+#ifndef PIDFD_INFO_SUPPORTED_MASK
+#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5)
+#endif
+
+#ifndef PIDFD_INFO_COREDUMP_SIGNAL
+#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6)
+#endif
+
#ifndef PIDFD_COREDUMPED
#define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */
#endif
@@ -183,8 +191,11 @@ struct pidfd_info {
__u32 fsuid;
__u32 fsgid;
__s32 exit_code;
- __u32 coredump_mask;
- __u32 __spare1;
+ struct {
+ __u32 coredump_mask;
+ __u32 coredump_signal;
+ };
+ __u64 supported_mask;
};
/*
diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c
index a0eb6e81eaa2..cb5430a2fd75 100644
--- a/tools/testing/selftests/pidfd/pidfd_info_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_info_test.c
@@ -690,4 +690,77 @@ TEST_F(pidfd_info, thread_group_exec_thread)
EXPECT_EQ(close(pidfd_thread), 0);
}
+/*
+ * Test: PIDFD_INFO_SUPPORTED_MASK field
+ *
+ * Verify that when PIDFD_INFO_SUPPORTED_MASK is requested, the kernel
+ * returns the supported_mask field indicating which flags the kernel supports.
+ */
+TEST(supported_mask_field)
+{
+ struct pidfd_info info = {
+ .mask = PIDFD_INFO_SUPPORTED_MASK,
+ };
+ int pidfd;
+ pid_t pid;
+
+ pid = create_child(&pidfd, 0);
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0)
+ pause();
+
+ /* Request supported_mask field */
+ ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0);
+
+ /* Verify PIDFD_INFO_SUPPORTED_MASK is set in the reply */
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK));
+
+ /* Verify supported_mask contains expected flags */
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_PID));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CREDS));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CGROUPID));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_EXIT));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_SUPPORTED_MASK));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP_SIGNAL));
+
+ /* Clean up */
+ sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
+ sys_waitid(P_PIDFD, pidfd, NULL, WEXITED);
+ close(pidfd);
+}
+
+/*
+ * Test: PIDFD_INFO_SUPPORTED_MASK always available
+ *
+ * Verify that supported_mask is returned even when other fields are requested.
+ */
+TEST(supported_mask_with_other_fields)
+{
+ struct pidfd_info info = {
+ .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_SUPPORTED_MASK,
+ };
+ int pidfd;
+ pid_t pid;
+
+ pid = create_child(&pidfd, 0);
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0)
+ pause();
+
+ ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0);
+
+ /* Both fields should be present */
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CGROUPID));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK));
+ ASSERT_NE(info.supported_mask, 0);
+
+ /* Clean up */
+ sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
+ sys_waitid(P_PIDFD, pidfd, NULL, WEXITED);
+ close(pidfd);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh
index 88ca4e368489..b5239b52cb5d 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh
@@ -31,7 +31,7 @@ fi
if ! cp "$oldrun/scenarios" $T/scenarios.oldrun
then
# Later on, can reconstitute this from console.log files.
- echo Prior run batches file does not exist: $oldrun/batches
+ echo Prior run scenarios file does not exist: $oldrun/scenarios
exit 1
fi
@@ -68,7 +68,7 @@ usage () {
echo " --datestamp string"
echo " --dryrun"
echo " --duration minutes | <seconds>s | <hours>h | <days>d"
- echo " --link hard|soft|copy"
+ echo " --link hard|soft|copy|inplace|inplace-force"
echo " --remote"
echo " --rundir /new/res/path"
echo "Command line: $scriptname $args"
@@ -121,7 +121,7 @@ do
shift
;;
--link)
- checkarg --link "hard|soft|copy" "$#" "$2" 'hard\|soft\|copy' '^--'
+ checkarg --link "hard|soft|copy|inplace|inplace-force" "$#" "$2" 'hard\|soft\|copy\|inplace\|inplace-force' '^--'
case "$2" in
copy)
arg_link="cp -R"
@@ -132,6 +132,14 @@ do
soft)
arg_link="cp -Rs"
;;
+ inplace)
+ arg_link="inplace"
+ rundir="$oldrun"
+ ;;
+ inplace-force)
+ arg_link="inplace-force"
+ rundir="$oldrun"
+ ;;
esac
shift
;;
@@ -172,21 +180,37 @@ fi
echo ---- Re-run results directory: $rundir
-# Copy old run directory tree over and adjust.
-mkdir -p "`dirname "$rundir"`"
-if ! $arg_link "$oldrun" "$rundir"
-then
- echo "Cannot copy from $oldrun to $rundir."
- usage
-fi
-rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log
-touch "$rundir/log"
-echo $scriptname $args | tee -a "$rundir/log"
-echo $oldrun > "$rundir/re-run"
-if ! test -d "$rundir/../../bin"
+if test "$oldrun" != "$rundir"
then
- $arg_link "$oldrun/../../bin" "$rundir/../.."
+ # Copy old run directory tree over and adjust.
+ mkdir -p "`dirname "$rundir"`"
+ if ! $arg_link "$oldrun" "$rundir"
+ then
+ echo "Cannot copy from $oldrun to $rundir."
+ usage
+ fi
+ rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log
+ touch "$rundir/log"
+ echo $scriptname $args | tee -a "$rundir/log"
+ echo $oldrun > "$rundir/re-run"
+ if ! test -d "$rundir/../../bin"
+ then
+ $arg_link "$oldrun/../../bin" "$rundir/../.."
+ fi
+else
+ # Check for a run having already happened.
+ find "$rundir" -name console.log -print > $T/oldrun-console.log
+ if test -s $T/oldrun-console.log
+ then
+ echo Run already took place in $rundir
+ if test "$arg_link" = inplace
+ then
+ usage
+ fi
+ fi
fi
+
+# Find runs to be done based on their qemu-cmd files.
for i in $rundir/*/qemu-cmd
do
cp "$i" $T
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
new file mode 100755
index 000000000000..2ff905a1853b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Usage: kvm-series.sh config-list commit-id-list [ kvm.sh parameters ]
+#
+# Tests the specified list of unadorned configs ("TREE01 SRCU-P" but not
+# "CFLIST" or "3*TRACE01") and an indication of a set of commits to test,
+# then runs each commit through the specified list of commits using kvm.sh.
+# The runs are grouped into a -series/config/commit directory tree.
+# Each run defaults to a duration of one minute.
+#
+# Run in top-level Linux source directory. Please note that this is in
+# no way a replacement for "git bisect"!!!
+#
+# This script is intended to replace kvm-check-branches.sh by providing
+# ease of use and faster execution.
+
+T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`"
+trap 'rm -rf $T' 0
+
+scriptname=$0
+args="$*"
+
+config_list="${1}"
+if test -z "${config_list}"
+then
+ echo "$0: Need a quoted list of --config arguments for first argument."
+ exit 1
+fi
+if test -z "${config_list}" || echo "${config_list}" | grep -q '\*'
+then
+ echo "$0: Repetition ('*') not allowed in config list."
+ exit 1
+fi
+
+commit_list="${2}"
+if test -z "${commit_list}"
+then
+ echo "$0: Need a list of commits (e.g., HEAD^^^..) for second argument."
+ exit 2
+fi
+git log --pretty=format:"%h" "${commit_list}" > $T/commits
+ret=$?
+if test "${ret}" -ne 0
+then
+ echo "$0: Invalid commit list ('${commit_list}')."
+ exit 2
+fi
+sha1_list=`cat $T/commits`
+
+shift
+shift
+
+RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE
+PATH=${RCUTORTURE}/bin:$PATH; export PATH
+. functions.sh
+
+ret=0
+nfail=0
+nsuccess=0
+faillist=
+successlist=
+cursha1="`git rev-parse --abbrev-ref HEAD`"
+ds="`date +%Y.%m.%d-%H.%M.%S`-series"
+startdate="`date`"
+starttime="`get_starttime`"
+
+echo " --- " $scriptname $args | tee -a $T/log
+echo " --- Results directory: " $ds | tee -a $T/log
+
+for config in ${config_list}
+do
+ sha_n=0
+ for sha in ${sha1_list}
+ do
+ sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order.
+ echo Starting ${config}/${sha1} at `date` | tee -a $T/log
+ git checkout "${sha}"
+ time tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 "$@"
+ curret=$?
+ if test "${curret}" -ne 0
+ then
+ nfail=$((nfail+1))
+ faillist="$faillist ${config}/${sha1}(${curret})"
+ else
+ nsuccess=$((nsuccess+1))
+ successlist="$successlist ${config}/${sha1}"
+ # Successful run, so remove large files.
+ rm -f ${RCUTORTURE}/$ds/${config}/${sha1}/{vmlinux,bzImage,System.map,Module.symvers}
+ fi
+ if test "${ret}" -eq 0
+ then
+ ret=${curret}
+ fi
+ sha_n=$((sha_n+1))
+ done
+done
+git checkout "${cursha1}"
+
+echo ${nsuccess} SUCCESSES: | tee -a $T/log
+echo ${successlist} | fmt | tee -a $T/log
+echo | tee -a $T/log
+echo ${nfail} FAILURES: | tee -a $T/log
+echo ${faillist} | fmt | tee -a $T/log
+if test -n "${faillist}"
+then
+ echo | tee -a $T/log
+ echo Failures across commits: | tee -a $T/log
+ echo ${faillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' |
+ sort | uniq -c | sort -k2n | tee -a $T/log
+fi
+echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log
+echo Summary: Successes: ${nsuccess} Failures: ${nfail} | tee -a $T/log
+cp $T/log tools/testing/selftests/rcutorture/res/${ds}
+
+exit "${ret}"
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 617cba339d28..fff15821c44c 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -199,7 +199,7 @@ do
fi
;;
--kconfig|--kconfigs)
- checkarg --kconfig "(Kconfig options)" $# "$2" '^\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\( \+\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\)* *$' '^error$'
+ checkarg --kconfig "(Kconfig options)" $# "$2" '^\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|-\?[0-9]\+\|"[^"]*"\)\( \+\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|-\?[0-9]\+\|"[^"]*"\)\)* *$' '^error$'
TORTURE_KCONFIG_ARG="`echo "$TORTURE_KCONFIG_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`"
shift
;;
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
index dc4985064b3a..67caf4276bb0 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -16,3 +16,4 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
CONFIG_RCU_EQS_DEBUG=y
CONFIG_RCU_LAZY=y
+CONFIG_RCU_DYNTICKS_TORTURE=y
diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
index 33baaa9f9997..e7b858cd3736 100644
--- a/tools/testing/selftests/rseq/rseq-s390.h
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -28,8 +28,6 @@ do { \
RSEQ_WRITE_ONCE(*(p), v); \
} while (0)
-#ifdef __s390x__
-
#define LONG_L "lg"
#define LONG_S "stg"
#define LONG_LT_R "ltgr"
@@ -63,43 +61,6 @@ do { \
".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \
".popsection\n\t"
-#elif __s390__
-
-#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
- start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_cs, \"aw\"\n\t" \
- ".balign 32\n\t" \
- __rseq_str(label) ":\n\t" \
- ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
- ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
- ".popsection\n\t" \
- ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
- ".long 0x0, " __rseq_str(label) "b\n\t" \
- ".popsection\n\t"
-
-/*
- * Exit points of a rseq critical section consist of all instructions outside
- * of the critical section where a critical section can either branch to or
- * reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_cs section and should not be
- * explicitly defined as additional exit points. Knowing all exit points is
- * useful to assist debuggers stepping over the critical section.
- */
-#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
- ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
- ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \
- ".popsection\n\t"
-
-#define LONG_L "l"
-#define LONG_S "st"
-#define LONG_LT_R "ltr"
-#define LONG_CMP "c"
-#define LONG_CMP_R "cr"
-#define LONG_ADDI "ahi"
-#define LONG_ADD_R "ar"
-
-#endif
-
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh
index 0443beacf362..d4be97498b32 100755
--- a/tools/testing/selftests/run_kselftest.sh
+++ b/tools/testing/selftests/run_kselftest.sh
@@ -33,6 +33,7 @@ Usage: $0 [OPTIONS]
-c | --collection COLLECTION Run all tests from COLLECTION
-l | --list List the available collection:test entries
-d | --dry-run Don't actually run any tests
+ -f | --no-error-on-fail Don't exit with an error just because tests failed
-n | --netns Run each test in namespace
-h | --help Show this usage info
-o | --override-timeout Number of seconds after which we timeout
@@ -44,6 +45,7 @@ COLLECTIONS=""
TESTS=""
dryrun=""
kselftest_override_timeout=""
+ERROR_ON_FAIL=true
while true; do
case "$1" in
-s | --summary)
@@ -65,6 +67,9 @@ while true; do
-d | --dry-run)
dryrun="echo"
shift ;;
+ -f | --no-error-on-fail)
+ ERROR_ON_FAIL=false
+ shift ;;
-n | --netns)
RUN_IN_NETNS=1
shift ;;
@@ -105,9 +110,18 @@ if [ -n "$TESTS" ]; then
available="$(echo "$valid" | sed -e 's/ /\n/g')"
fi
+kselftest_failures_file="$(mktemp --tmpdir kselftest-failures-XXXXXX)"
+export kselftest_failures_file
+
collections=$(echo "$available" | cut -d: -f1 | sort | uniq)
for collection in $collections ; do
[ -w /dev/kmsg ] && echo "kselftest: Running tests in $collection" >> /dev/kmsg
tests=$(echo "$available" | grep "^$collection:" | cut -d: -f2)
($dryrun cd "$collection" && $dryrun run_many $tests)
done
+
+failures="$(cat "$kselftest_failures_file")"
+rm "$kselftest_failures_file"
+if "$ERROR_ON_FAIL" && [ "$failures" ]; then
+ exit 1
+fi
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 9d9d6b4c38b0..5fe45f9c5f8f 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -174,6 +174,7 @@ auto-test-targets := \
minimal \
numa \
allowed_cpus \
+ peek_dsq \
prog_run \
reload_loop \
select_cpu_dfl \
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
new file mode 100644
index 000000000000..a3faf5bb49d6
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A BPF program for testing DSQ operations and peek in particular.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
+ */
+
+#include <scx/common.bpf.h>
+#include <scx/compat.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei); /* Error handling */
+
+#define MAX_SAMPLES 100
+#define MAX_CPUS 512
+#define DSQ_POOL_SIZE 8
+int max_samples = MAX_SAMPLES;
+int max_cpus = MAX_CPUS;
+int dsq_pool_size = DSQ_POOL_SIZE;
+
+/* Global variables to store test results */
+int dsq_peek_result1 = -1;
+long dsq_inserted_pid = -1;
+int insert_test_cpu = -1; /* Set to the cpu that performs the test */
+long dsq_peek_result2 = -1;
+long dsq_peek_result2_pid = -1;
+long dsq_peek_result2_expected = -1;
+int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */
+int real_dsq_id = 1235; /* DSQ for normal operation */
+int enqueue_count = -1;
+int dispatch_count = -1;
+bool debug_ksym_exists;
+
+/* DSQ pool for stress testing */
+int dsq_pool_base_id = 2000;
+int phase1_complete = -1;
+long total_peek_attempts = -1;
+long successful_peeks = -1;
+
+/* BPF map for sharing peek results with userspace */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, MAX_SAMPLES);
+ __type(key, u32);
+ __type(value, long);
+} peek_results SEC(".maps");
+
+static int get_random_dsq_id(void)
+{
+ u64 time = bpf_ktime_get_ns();
+
+ return dsq_pool_base_id + (time % DSQ_POOL_SIZE);
+}
+
+static void record_peek_result(long pid)
+{
+ u32 slot_key;
+ long *slot_pid_ptr;
+ int ix;
+
+ if (pid <= 0)
+ return;
+
+ /* Find an empty slot or one with the same PID */
+ bpf_for(ix, 0, 10) {
+ slot_key = (pid + ix) % MAX_SAMPLES;
+ slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key);
+ if (!slot_pid_ptr)
+ continue;
+
+ if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) {
+ *slot_pid_ptr = pid;
+ break;
+ }
+ }
+}
+
+/* Scan all DSQs in the pool and try to move a task to local */
+static int scan_dsq_pool(void)
+{
+ struct task_struct *task;
+ int moved = 0;
+ int i;
+
+ bpf_for(i, 0, DSQ_POOL_SIZE) {
+ int dsq_id = dsq_pool_base_id + i;
+
+ total_peek_attempts++;
+
+ task = __COMPAT_scx_bpf_dsq_peek(dsq_id);
+ if (task) {
+ successful_peeks++;
+ record_peek_result(task->pid);
+
+ /* Try to move this task to local */
+ if (!moved && scx_bpf_dsq_move_to_local(dsq_id) == 0) {
+ moved = 1;
+ break;
+ }
+ }
+ }
+ return moved;
+}
+
+/* Struct_ops scheduler for testing DSQ peek operations */
+void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ struct task_struct *peek_result;
+ int last_insert_test_cpu, cpu;
+
+ enqueue_count++;
+ cpu = bpf_get_smp_processor_id();
+ last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu);
+
+ /* Phase 1: Simple insert-then-peek test (only on first task) */
+ if (last_insert_test_cpu == -1) {
+ bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu);
+
+ /* Test 1: Peek empty DSQ - should return NULL */
+ peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
+ dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */
+
+ /* Test 2: Insert task into test DSQ for testing in dispatch callback */
+ dsq_inserted_pid = p->pid;
+ scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags);
+ dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */
+ } else if (!phase1_complete) {
+ /* Still in phase 1, use real DSQ */
+ scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags);
+ } else {
+ /* Phase 2: Random DSQ insertion for stress testing */
+ int random_dsq_id = get_random_dsq_id();
+
+ scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags);
+ }
+}
+
+void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev)
+{
+ dispatch_count++;
+
+ /* Phase 1: Complete the simple peek test if we inserted a task but
+ * haven't tested peek yet
+ */
+ if (insert_test_cpu == cpu && dsq_peek_result2 == -1) {
+ struct task_struct *peek_result;
+
+ bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu);
+
+ /* Test 3: Peek DSQ after insert - should return the task we inserted */
+ peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
+ /* Store the PID of the peeked task for comparison */
+ dsq_peek_result2 = (long)peek_result;
+ dsq_peek_result2_pid = peek_result ? peek_result->pid : -1;
+
+ /* Now consume the task since we've peeked at it */
+ scx_bpf_dsq_move_to_local(test_dsq_id);
+
+ /* Mark phase 1 as complete */
+ phase1_complete = 1;
+ bpf_printk("Phase 1 complete, starting phase 2 stress testing");
+ } else if (!phase1_complete) {
+ /* Still in phase 1, use real DSQ */
+ scx_bpf_dsq_move_to_local(real_dsq_id);
+ } else {
+ /* Phase 2: Scan all DSQs in the pool and try to move a task */
+ if (!scan_dsq_pool()) {
+ /* No tasks found in DSQ pool, fall back to real DSQ */
+ scx_bpf_dsq_move_to_local(real_dsq_id);
+ }
+ }
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init)
+{
+ s32 err;
+ int i;
+
+ /* Always set debug values so we can see which version we're using */
+ debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 1 : 0;
+
+ /* Initialize state first */
+ insert_test_cpu = -1;
+ enqueue_count = 0;
+ dispatch_count = 0;
+ phase1_complete = 0;
+ total_peek_attempts = 0;
+ successful_peeks = 0;
+
+ /* Create the test and real DSQs */
+ err = scx_bpf_create_dsq(test_dsq_id, -1);
+ if (err) {
+ scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err);
+ return err;
+ }
+ err = scx_bpf_create_dsq(real_dsq_id, -1);
+ if (err) {
+ scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err);
+ return err;
+ }
+
+ /* Create the DSQ pool for stress testing */
+ bpf_for(i, 0, DSQ_POOL_SIZE) {
+ int dsq_id = dsq_pool_base_id + i;
+
+ err = scx_bpf_create_dsq(dsq_id, -1);
+ if (err) {
+ scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err);
+ return err;
+ }
+ }
+
+ /* Initialize the peek results map */
+ bpf_for(i, 0, MAX_SAMPLES) {
+ u32 key = i;
+ long pid = -1;
+
+ bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY);
+ }
+
+ return 0;
+}
+
+void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei)
+{
+ int i;
+
+ /* Destroy the primary DSQs */
+ scx_bpf_destroy_dsq(test_dsq_id);
+ scx_bpf_destroy_dsq(real_dsq_id);
+
+ /* Destroy the DSQ pool */
+ bpf_for(i, 0, DSQ_POOL_SIZE) {
+ int dsq_id = dsq_pool_base_id + i;
+
+ scx_bpf_destroy_dsq(dsq_id);
+ }
+
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops peek_dsq_ops = {
+ .enqueue = (void *)peek_dsq_enqueue,
+ .dispatch = (void *)peek_dsq_dispatch,
+ .init = (void *)peek_dsq_init,
+ .exit = (void *)peek_dsq_exit,
+ .name = "peek_dsq",
+};
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.c b/tools/testing/selftests/sched_ext/peek_dsq.c
new file mode 100644
index 000000000000..a717384a3224
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/peek_dsq.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for DSQ operations including create, destroy, and peek operations.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <string.h>
+#include <sched.h>
+#include "peek_dsq.bpf.skel.h"
+#include "scx_test.h"
+
+#define NUM_WORKERS 4
+
+static bool workload_running = true;
+static pthread_t workload_threads[NUM_WORKERS];
+
+/**
+ * Background workload thread that sleeps and wakes rapidly to exercise
+ * the scheduler's enqueue operations and ensure DSQ operations get tested.
+ */
+static void *workload_thread_fn(void *arg)
+{
+ while (workload_running) {
+ /* Sleep for a very short time to trigger scheduler activity */
+ usleep(1000); /* 1ms sleep */
+ /* Yield to ensure we go through the scheduler */
+ sched_yield();
+ }
+ return NULL;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct peek_dsq *skel;
+
+ skel = peek_dsq__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name)
+{
+ long count = 0;
+
+ printf("Observed %s DSQ peek pids:\n", dsq_name);
+ for (int i = 0; i < max_samples; i++) {
+ long pid;
+ int err;
+
+ err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid);
+ if (err == 0) {
+ if (pid == 0) {
+ printf(" Sample %d: NULL peek\n", i);
+ } else if (pid > 0) {
+ printf(" Sample %d: pid %ld\n", i, pid);
+ count++;
+ }
+ } else {
+ printf(" Sample %d: error reading pid (err=%d)\n", i, err);
+ }
+ }
+ printf("Observed ~%ld pids in the %s DSQ(s)\n", count, dsq_name);
+ return count;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct peek_dsq *skel = ctx;
+ bool failed = false;
+ int seconds = 3;
+ int err;
+
+ /* Enable the scheduler to test DSQ operations */
+ printf("Enabling scheduler to test DSQ insert operations...\n");
+
+ struct bpf_link *link =
+ bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops);
+
+ if (!link) {
+ SCX_ERR("Failed to attach struct_ops");
+ return SCX_TEST_FAIL;
+ }
+
+ printf("Starting %d background workload threads...\n", NUM_WORKERS);
+ workload_running = true;
+ for (int i = 0; i < NUM_WORKERS; i++) {
+ err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL);
+ if (err) {
+ SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err));
+ /* Stop already created threads */
+ workload_running = false;
+ for (int j = 0; j < i; j++)
+ pthread_join(workload_threads[j], NULL);
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ }
+
+ printf("Waiting for enqueue events.\n");
+ sleep(seconds);
+ while (skel->data->enqueue_count <= 0) {
+ printf(".");
+ fflush(stdout);
+ sleep(1);
+ seconds++;
+ if (seconds >= 30) {
+ printf("\n\u2717 Timeout waiting for enqueue events\n");
+ /* Stop workload threads and cleanup */
+ workload_running = false;
+ for (int i = 0; i < NUM_WORKERS; i++)
+ pthread_join(workload_threads[i], NULL);
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ }
+
+ workload_running = false;
+ for (int i = 0; i < NUM_WORKERS; i++) {
+ err = pthread_join(workload_threads[i], NULL);
+ if (err) {
+ SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err));
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ }
+ printf("Background workload threads stopped.\n");
+
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+
+ /* Detach the scheduler */
+ bpf_link__destroy(link);
+
+ printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds,
+ skel->data->enqueue_count, skel->data->dispatch_count);
+ printf("Debug: ksym_exists=%d\n",
+ skel->bss->debug_ksym_exists);
+
+ /* Check DSQ insert result */
+ printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu);
+ if (skel->data->insert_test_cpu != -1)
+ printf("\u2713 DSQ insert succeeded !\n");
+ else {
+ printf("\u2717 DSQ insert failed or not attempted\n");
+ failed = true;
+ }
+
+ /* Check DSQ peek results */
+ printf(" DSQ peek result 1 (before insert): %d\n",
+ skel->data->dsq_peek_result1);
+ if (skel->data->dsq_peek_result1 == 0)
+ printf("\u2713 DSQ peek verification success: peek returned NULL!\n");
+ else {
+ printf("\u2717 DSQ peek verification failed\n");
+ failed = true;
+ }
+
+ printf(" DSQ peek result 2 (after insert): %ld\n",
+ skel->data->dsq_peek_result2);
+ printf(" DSQ peek result 2, expected: %ld\n",
+ skel->data->dsq_peek_result2_expected);
+ if (skel->data->dsq_peek_result2 ==
+ skel->data->dsq_peek_result2_expected)
+ printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n");
+ else {
+ printf("\u2717 DSQ peek verification failed\n");
+ failed = true;
+ }
+
+ printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid);
+ printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid);
+
+ int pid_count;
+
+ pid_count = print_observed_pids(skel->maps.peek_results,
+ skel->data->max_samples, "DSQ pool");
+ printf("Total non-null peek observations: %ld out of %ld\n",
+ skel->data->successful_peeks, skel->data->total_peek_attempts);
+
+ if (skel->bss->debug_ksym_exists && pid_count == 0) {
+ printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n");
+ failed = true;
+ }
+ if (skel->bss->debug_ksym_exists && pid_count > 0)
+ printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n");
+
+ if (failed)
+ return SCX_TEST_FAIL;
+ else
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct peek_dsq *skel = ctx;
+
+ if (workload_running) {
+ workload_running = false;
+ for (int i = 0; i < NUM_WORKERS; i++)
+ pthread_join(workload_threads[i], NULL);
+ }
+
+ peek_dsq__destroy(skel);
+}
+
+struct scx_test peek_dsq = {
+ .name = "peek_dsq",
+ .description =
+ "Test DSQ create/destroy operations and future peek functionality",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&peek_dsq)
diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
index 998e5a2f4579..0091bcd91c2c 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -961,5 +961,49 @@
"teardown": [
"$TC qdisc del dev $DUMMY root"
]
+ },
+ {
+ "id": "4989",
+ "name": "Try to add an fq child to an ingress qdisc",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DUMMY handle ffff:0 ingress"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq",
+ "expExitCode": "2",
+ "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:",
+ "matchJSON": [],
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress"
+ ]
+ },
+ {
+ "id": "c2b0",
+ "name": "Try to add an fq child to a clsact qdisc",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DUMMY handle ffff:0 clsact"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq",
+ "expExitCode": "2",
+ "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:",
+ "matchJSON": [],
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY clsact"
+ ]
}
]
diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c
index 252c6308c569..10badae13ebe 100644
--- a/tools/testing/selftests/timers/nanosleep.c
+++ b/tools/testing/selftests/timers/nanosleep.c
@@ -116,6 +116,56 @@ int nanosleep_test(int clockid, long long ns)
return 0;
}
+static void dummy_event_handler(int val)
+{
+ /* No action needed */
+}
+
+static int nanosleep_test_remaining(int clockid)
+{
+ struct timespec rqtp = {}, rmtp = {};
+ struct itimerspec itimer = {};
+ struct sigaction sa = {};
+ timer_t timer;
+ int ret;
+
+ sa.sa_handler = dummy_event_handler;
+ ret = sigaction(SIGALRM, &sa, NULL);
+ if (ret)
+ return -1;
+
+ ret = timer_create(clockid, NULL, &timer);
+ if (ret)
+ return -1;
+
+ itimer.it_value.tv_nsec = NSEC_PER_SEC / 4;
+ ret = timer_settime(timer, 0, &itimer, NULL);
+ if (ret)
+ return -1;
+
+ rqtp.tv_nsec = NSEC_PER_SEC / 2;
+ ret = clock_nanosleep(clockid, 0, &rqtp, &rmtp);
+ if (ret != EINTR)
+ return -1;
+
+ ret = timer_delete(timer);
+ if (ret)
+ return -1;
+
+ sa.sa_handler = SIG_DFL;
+ ret = sigaction(SIGALRM, &sa, NULL);
+ if (ret)
+ return -1;
+
+ if (!in_order((struct timespec) {}, rmtp))
+ return -1;
+
+ if (!in_order(rmtp, rqtp))
+ return -1;
+
+ return 0;
+}
+
int main(int argc, char **argv)
{
long long length;
@@ -150,6 +200,11 @@ int main(int argc, char **argv)
}
length *= 100;
}
+ ret = nanosleep_test_remaining(clockid);
+ if (ret < 0) {
+ ksft_test_result_fail("%-31s\n", clockstring(clockid));
+ ksft_exit_fail();
+ }
ksft_test_result_pass("%-31s\n", clockstring(clockid));
next:
ret = 0;
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index f0eceb0faf34..a563c438ac79 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -18,6 +18,7 @@
#include <time.h>
#include <include/vdso/time64.h>
#include <pthread.h>
+#include <stdbool.h>
#include "../kselftest.h"
@@ -670,8 +671,14 @@ static void check_timer_create_exact(void)
int main(int argc, char **argv)
{
+ bool run_sig_ign_tests = ksft_min_kernel_version(6, 13);
+
ksft_print_header();
- ksft_set_plan(19);
+ if (run_sig_ign_tests) {
+ ksft_set_plan(19);
+ } else {
+ ksft_set_plan(10);
+ }
ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n");
ksft_print_msg("based timers if other threads run on the CPU...\n");
@@ -695,15 +702,20 @@ int main(int argc, char **argv)
check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
check_timer_distribution();
- check_sig_ign(0);
- check_sig_ign(1);
- check_rearm();
- check_delete();
- check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
- check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
- check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
- check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
- check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID");
+ if (run_sig_ign_tests) {
+ check_sig_ign(0);
+ check_sig_ign(1);
+ check_rearm();
+ check_delete();
+ check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
+ check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
+ check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
+ check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
+ check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID");
+ } else {
+ ksft_print_msg("Skipping SIG_IGN tests on kernel < 6.13\n");
+ }
+
check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID");
diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c
index 5288e768b207..68625362add2 100644
--- a/tools/testing/selftests/user_events/perf_test.c
+++ b/tools/testing/selftests/user_events/perf_test.c
@@ -236,7 +236,7 @@ TEST_F(user, perf_empty_events) {
ASSERT_EQ(1 << reg.enable_bit, self->check);
/* Ensure write shows up at correct offset */
- ASSERT_NE(-1, write(self->data_fd, &reg.write_index,
+ ASSERT_NE(-1, write(self->data_fd, (void *)&reg.write_index,
sizeof(reg.write_index)));
val = (void *)(((char *)perf_page) + perf_page->data_offset);
ASSERT_EQ(PERF_RECORD_SAMPLE, *val);
diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h
index 5fdd0f362337..50c261005111 100644
--- a/tools/testing/selftests/vDSO/vdso_config.h
+++ b/tools/testing/selftests/vDSO/vdso_config.h
@@ -25,10 +25,6 @@
#define VDSO_VERSION 1
#define VDSO_NAMES 0
#define VDSO_32BIT 1
-#elif defined (__s390__) && !defined(__s390x__)
-#define VDSO_VERSION 2
-#define VDSO_NAMES 0
-#define VDSO_32BIT 1
#elif defined (__s390x__)
#define VDSO_VERSION 2
#define VDSO_NAMES 0
diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h
index ed31606e01b7..69ec0c856481 100644
--- a/tools/testing/selftests/vfio/lib/include/vfio_util.h
+++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h
@@ -4,9 +4,12 @@
#include <fcntl.h>
#include <string.h>
-#include <linux/vfio.h>
+
+#include <uapi/linux/types.h>
+#include <linux/iommufd.h>
#include <linux/list.h>
#include <linux/pci_regs.h>
+#include <linux/vfio.h>
#include "../../../kselftest.h"
@@ -185,6 +188,13 @@ struct vfio_pci_device {
struct vfio_pci_driver driver;
};
+struct iova_allocator {
+ struct iommu_iova_range *ranges;
+ u32 nranges;
+ u32 range_idx;
+ u64 range_offset;
+};
+
/*
* Return the BDF string of the device that the test should use.
*
@@ -206,10 +216,36 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_
void vfio_pci_device_cleanup(struct vfio_pci_device *device);
void vfio_pci_device_reset(struct vfio_pci_device *device);
-void vfio_pci_dma_map(struct vfio_pci_device *device,
- struct vfio_dma_region *region);
-void vfio_pci_dma_unmap(struct vfio_pci_device *device,
- struct vfio_dma_region *region);
+struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device,
+ u32 *nranges);
+
+struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device);
+void iova_allocator_cleanup(struct iova_allocator *allocator);
+iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size);
+
+int __vfio_pci_dma_map(struct vfio_pci_device *device,
+ struct vfio_dma_region *region);
+int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
+ struct vfio_dma_region *region,
+ u64 *unmapped);
+int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped);
+
+static inline void vfio_pci_dma_map(struct vfio_pci_device *device,
+ struct vfio_dma_region *region)
+{
+ VFIO_ASSERT_EQ(__vfio_pci_dma_map(device, region), 0);
+}
+
+static inline void vfio_pci_dma_unmap(struct vfio_pci_device *device,
+ struct vfio_dma_region *region)
+{
+ VFIO_ASSERT_EQ(__vfio_pci_dma_unmap(device, region, NULL), 0);
+}
+
+static inline void vfio_pci_dma_unmap_all(struct vfio_pci_device *device)
+{
+ VFIO_ASSERT_EQ(__vfio_pci_dma_unmap_all(device, NULL), 0);
+}
void vfio_pci_config_access(struct vfio_pci_device *device, bool write,
size_t config, size_t size, void *data);
diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
index 0921b2451ba5..b479a359da12 100644
--- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c
+++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
@@ -2,6 +2,7 @@
#include <dirent.h>
#include <fcntl.h>
#include <libgen.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
@@ -11,11 +12,12 @@
#include <sys/mman.h>
#include <uapi/linux/types.h>
+#include <linux/iommufd.h>
#include <linux/limits.h>
#include <linux/mman.h>
+#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/vfio.h>
-#include <linux/iommufd.h>
#include "../../../kselftest.h"
#include <vfio_util.h>
@@ -28,6 +30,249 @@
VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \
} while (0)
+static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz,
+ u32 *cap_offset)
+{
+ struct vfio_info_cap_header *hdr;
+
+ if (!*cap_offset)
+ return NULL;
+
+ VFIO_ASSERT_LT(*cap_offset, bufsz);
+ VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr));
+
+ hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset);
+ *cap_offset = hdr->next;
+
+ return hdr;
+}
+
+static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info,
+ u16 cap_id)
+{
+ struct vfio_info_cap_header *hdr;
+ u32 cap_offset = info->cap_offset;
+ u32 max_depth;
+ u32 depth = 0;
+
+ if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
+ return NULL;
+
+ if (cap_offset)
+ VFIO_ASSERT_GE(cap_offset, sizeof(*info));
+
+ max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr);
+
+ while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) {
+ depth++;
+ VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n");
+
+ if (hdr->id == cap_id)
+ return hdr;
+ }
+
+ return NULL;
+}
+
+/* Return buffer including capability chain, if present. Free with free() */
+static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device)
+{
+ struct vfio_iommu_type1_info *info;
+
+ info = malloc(sizeof(*info));
+ VFIO_ASSERT_NOT_NULL(info);
+
+ *info = (struct vfio_iommu_type1_info) {
+ .argsz = sizeof(*info),
+ };
+
+ ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info);
+ VFIO_ASSERT_GE(info->argsz, sizeof(*info));
+
+ info = realloc(info, info->argsz);
+ VFIO_ASSERT_NOT_NULL(info);
+
+ ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info);
+ VFIO_ASSERT_GE(info->argsz, sizeof(*info));
+
+ return info;
+}
+
+/*
+ * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to
+ * report iommufd's iommu_iova_range. Free with free().
+ */
+static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device,
+ u32 *nranges)
+{
+ struct vfio_iommu_type1_info_cap_iova_range *cap_range;
+ struct vfio_iommu_type1_info *info;
+ struct vfio_info_cap_header *hdr;
+ struct iommu_iova_range *ranges = NULL;
+
+ info = vfio_iommu_get_info(device);
+ hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
+ VFIO_ASSERT_NOT_NULL(hdr);
+
+ cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header);
+ VFIO_ASSERT_GT(cap_range->nr_iovas, 0);
+
+ ranges = calloc(cap_range->nr_iovas, sizeof(*ranges));
+ VFIO_ASSERT_NOT_NULL(ranges);
+
+ for (u32 i = 0; i < cap_range->nr_iovas; i++) {
+ ranges[i] = (struct iommu_iova_range){
+ .start = cap_range->iova_ranges[i].start,
+ .last = cap_range->iova_ranges[i].end,
+ };
+ }
+
+ *nranges = cap_range->nr_iovas;
+
+ free(info);
+ return ranges;
+}
+
+/* Return iova ranges of the device's IOAS. Free with free() */
+static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device,
+ u32 *nranges)
+{
+ struct iommu_iova_range *ranges;
+ int ret;
+
+ struct iommu_ioas_iova_ranges query = {
+ .size = sizeof(query),
+ .ioas_id = device->ioas_id,
+ };
+
+ ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query);
+ VFIO_ASSERT_EQ(ret, -1);
+ VFIO_ASSERT_EQ(errno, EMSGSIZE);
+ VFIO_ASSERT_GT(query.num_iovas, 0);
+
+ ranges = calloc(query.num_iovas, sizeof(*ranges));
+ VFIO_ASSERT_NOT_NULL(ranges);
+
+ query.allowed_iovas = (uintptr_t)ranges;
+
+ ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query);
+ *nranges = query.num_iovas;
+
+ return ranges;
+}
+
+static int iova_range_comp(const void *a, const void *b)
+{
+ const struct iommu_iova_range *ra = a, *rb = b;
+
+ if (ra->start < rb->start)
+ return -1;
+
+ if (ra->start > rb->start)
+ return 1;
+
+ return 0;
+}
+
+/* Return sorted IOVA ranges of the device. Free with free(). */
+struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device,
+ u32 *nranges)
+{
+ struct iommu_iova_range *ranges;
+
+ if (device->iommufd)
+ ranges = iommufd_iova_ranges(device, nranges);
+ else
+ ranges = vfio_iommu_iova_ranges(device, nranges);
+
+ if (!ranges)
+ return NULL;
+
+ VFIO_ASSERT_GT(*nranges, 0);
+
+ /* Sort and check that ranges are sane and non-overlapping */
+ qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp);
+ VFIO_ASSERT_LT(ranges[0].start, ranges[0].last);
+
+ for (u32 i = 1; i < *nranges; i++) {
+ VFIO_ASSERT_LT(ranges[i].start, ranges[i].last);
+ VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start);
+ }
+
+ return ranges;
+}
+
+struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device)
+{
+ struct iova_allocator *allocator;
+ struct iommu_iova_range *ranges;
+ u32 nranges;
+
+ ranges = vfio_pci_iova_ranges(device, &nranges);
+ VFIO_ASSERT_NOT_NULL(ranges);
+
+ allocator = malloc(sizeof(*allocator));
+ VFIO_ASSERT_NOT_NULL(allocator);
+
+ *allocator = (struct iova_allocator){
+ .ranges = ranges,
+ .nranges = nranges,
+ .range_idx = 0,
+ .range_offset = 0,
+ };
+
+ return allocator;
+}
+
+void iova_allocator_cleanup(struct iova_allocator *allocator)
+{
+ free(allocator->ranges);
+ free(allocator);
+}
+
+iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size)
+{
+ VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n");
+ VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n");
+
+ for (;;) {
+ struct iommu_iova_range *range;
+ iova_t iova, last;
+
+ VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges,
+ "IOVA allocator out of space\n");
+
+ range = &allocator->ranges[allocator->range_idx];
+ iova = range->start + allocator->range_offset;
+
+ /* Check for sufficient space at the current offset */
+ if (check_add_overflow(iova, size - 1, &last) ||
+ last > range->last)
+ goto next_range;
+
+ /* Align iova to size */
+ iova = last & ~(size - 1);
+
+ /* Check for sufficient space at the aligned iova */
+ if (check_add_overflow(iova, size - 1, &last) ||
+ last > range->last)
+ goto next_range;
+
+ if (last == range->last) {
+ allocator->range_idx++;
+ allocator->range_offset = 0;
+ } else {
+ allocator->range_offset = last - range->start + 1;
+ }
+
+ return iova;
+
+next_range:
+ allocator->range_idx++;
+ allocator->range_offset = 0;
+ }
+}
+
iova_t __to_iova(struct vfio_pci_device *device, void *vaddr)
{
struct vfio_dma_region *region;
@@ -141,7 +386,7 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index,
ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info);
}
-static void vfio_iommu_dma_map(struct vfio_pci_device *device,
+static int vfio_iommu_dma_map(struct vfio_pci_device *device,
struct vfio_dma_region *region)
{
struct vfio_iommu_type1_dma_map args = {
@@ -152,10 +397,13 @@ static void vfio_iommu_dma_map(struct vfio_pci_device *device,
.size = region->size,
};
- ioctl_assert(device->container_fd, VFIO_IOMMU_MAP_DMA, &args);
+ if (ioctl(device->container_fd, VFIO_IOMMU_MAP_DMA, &args))
+ return -errno;
+
+ return 0;
}
-static void iommufd_dma_map(struct vfio_pci_device *device,
+static int iommufd_dma_map(struct vfio_pci_device *device,
struct vfio_dma_region *region)
{
struct iommu_ioas_map args = {
@@ -169,54 +417,108 @@ static void iommufd_dma_map(struct vfio_pci_device *device,
.ioas_id = device->ioas_id,
};
- ioctl_assert(device->iommufd, IOMMU_IOAS_MAP, &args);
+ if (ioctl(device->iommufd, IOMMU_IOAS_MAP, &args))
+ return -errno;
+
+ return 0;
}
-void vfio_pci_dma_map(struct vfio_pci_device *device,
+int __vfio_pci_dma_map(struct vfio_pci_device *device,
struct vfio_dma_region *region)
{
+ int ret;
+
if (device->iommufd)
- iommufd_dma_map(device, region);
+ ret = iommufd_dma_map(device, region);
else
- vfio_iommu_dma_map(device, region);
+ ret = vfio_iommu_dma_map(device, region);
+
+ if (ret)
+ return ret;
list_add(&region->link, &device->dma_regions);
+
+ return 0;
}
-static void vfio_iommu_dma_unmap(struct vfio_pci_device *device,
- struct vfio_dma_region *region)
+static int vfio_iommu_dma_unmap(int fd, u64 iova, u64 size, u32 flags,
+ u64 *unmapped)
{
struct vfio_iommu_type1_dma_unmap args = {
.argsz = sizeof(args),
- .iova = region->iova,
- .size = region->size,
+ .iova = iova,
+ .size = size,
+ .flags = flags,
};
- ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &args);
+ if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args))
+ return -errno;
+
+ if (unmapped)
+ *unmapped = args.size;
+
+ return 0;
}
-static void iommufd_dma_unmap(struct vfio_pci_device *device,
- struct vfio_dma_region *region)
+static int iommufd_dma_unmap(int fd, u64 iova, u64 length, u32 ioas_id,
+ u64 *unmapped)
{
struct iommu_ioas_unmap args = {
.size = sizeof(args),
- .iova = region->iova,
- .length = region->size,
- .ioas_id = device->ioas_id,
+ .iova = iova,
+ .length = length,
+ .ioas_id = ioas_id,
};
- ioctl_assert(device->iommufd, IOMMU_IOAS_UNMAP, &args);
+ if (ioctl(fd, IOMMU_IOAS_UNMAP, &args))
+ return -errno;
+
+ if (unmapped)
+ *unmapped = args.length;
+
+ return 0;
}
-void vfio_pci_dma_unmap(struct vfio_pci_device *device,
- struct vfio_dma_region *region)
+int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
+ struct vfio_dma_region *region, u64 *unmapped)
{
+ int ret;
+
if (device->iommufd)
- iommufd_dma_unmap(device, region);
+ ret = iommufd_dma_unmap(device->iommufd, region->iova,
+ region->size, device->ioas_id,
+ unmapped);
else
- vfio_iommu_dma_unmap(device, region);
+ ret = vfio_iommu_dma_unmap(device->container_fd, region->iova,
+ region->size, 0, unmapped);
+
+ if (ret)
+ return ret;
+
+ list_del_init(&region->link);
+
+ return 0;
+}
+
+int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped)
+{
+ int ret;
+ struct vfio_dma_region *curr, *next;
+
+ if (device->iommufd)
+ ret = iommufd_dma_unmap(device->iommufd, 0, UINT64_MAX,
+ device->ioas_id, unmapped);
+ else
+ ret = vfio_iommu_dma_unmap(device->container_fd, 0, 0,
+ VFIO_DMA_UNMAP_FLAG_ALL, unmapped);
+
+ if (ret)
+ return ret;
+
+ list_for_each_entry_safe(curr, next, &device->dma_regions, link)
+ list_del_init(&curr->link);
- list_del(&region->link);
+ return 0;
}
static void vfio_pci_region_get(struct vfio_pci_device *device, int index,
diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
index ab19c54a774d..102603d4407d 100644
--- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
+++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
@@ -3,6 +3,8 @@
#include <sys/mman.h>
#include <unistd.h>
+#include <uapi/linux/types.h>
+#include <linux/iommufd.h>
#include <linux/limits.h>
#include <linux/mman.h>
#include <linux/sizes.h>
@@ -93,6 +95,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova,
FIXTURE(vfio_dma_mapping_test) {
struct vfio_pci_device *device;
+ struct iova_allocator *iova_allocator;
};
FIXTURE_VARIANT(vfio_dma_mapping_test) {
@@ -112,13 +115,17 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous, 0, 0);
FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_2mb, SZ_2M, MAP_HUGETLB | MAP_HUGE_2MB);
FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB | MAP_HUGE_1GB);
+#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
+
FIXTURE_SETUP(vfio_dma_mapping_test)
{
self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
+ self->iova_allocator = iova_allocator_init(self->device);
}
FIXTURE_TEARDOWN(vfio_dma_mapping_test)
{
+ iova_allocator_cleanup(self->iova_allocator);
vfio_pci_device_cleanup(self->device);
}
@@ -129,6 +136,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
struct vfio_dma_region region;
struct iommu_mapping mapping;
u64 mapping_size = size;
+ u64 unmapped;
int rc;
region.vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
@@ -139,7 +147,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
else
ASSERT_NE(region.vaddr, MAP_FAILED);
- region.iova = (u64)region.vaddr;
+ region.iova = iova_allocator_alloc(self->iova_allocator, size);
region.size = size;
vfio_pci_dma_map(self->device, &region);
@@ -184,7 +192,9 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
}
unmap:
- vfio_pci_dma_unmap(self->device, &region);
+ rc = __vfio_pci_dma_unmap(self->device, &region, &unmapped);
+ ASSERT_EQ(rc, 0);
+ ASSERT_EQ(unmapped, region.size);
printf("Unmapped IOVA 0x%lx\n", region.iova);
ASSERT_EQ(INVALID_IOVA, __to_iova(self->device, region.vaddr));
ASSERT_NE(0, iommu_mapping_get(device_bdf, region.iova, &mapping));
@@ -192,6 +202,103 @@ unmap:
ASSERT_TRUE(!munmap(region.vaddr, size));
}
+FIXTURE(vfio_dma_map_limit_test) {
+ struct vfio_pci_device *device;
+ struct vfio_dma_region region;
+ size_t mmap_size;
+};
+
+FIXTURE_VARIANT(vfio_dma_map_limit_test) {
+ const char *iommu_mode;
+};
+
+#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \
+FIXTURE_VARIANT_ADD(vfio_dma_map_limit_test, _iommu_mode) { \
+ .iommu_mode = #_iommu_mode, \
+}
+
+FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES();
+
+#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
+
+FIXTURE_SETUP(vfio_dma_map_limit_test)
+{
+ struct vfio_dma_region *region = &self->region;
+ struct iommu_iova_range *ranges;
+ u64 region_size = getpagesize();
+ iova_t last_iova;
+ u32 nranges;
+
+ /*
+ * Over-allocate mmap by double the size to provide enough backing vaddr
+ * for overflow tests
+ */
+ self->mmap_size = 2 * region_size;
+
+ self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
+ region->vaddr = mmap(NULL, self->mmap_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(region->vaddr, MAP_FAILED);
+
+ ranges = vfio_pci_iova_ranges(self->device, &nranges);
+ VFIO_ASSERT_NOT_NULL(ranges);
+ last_iova = ranges[nranges - 1].last;
+ free(ranges);
+
+ /* One page prior to the last iova */
+ region->iova = last_iova & ~(region_size - 1);
+ region->size = region_size;
+}
+
+FIXTURE_TEARDOWN(vfio_dma_map_limit_test)
+{
+ vfio_pci_device_cleanup(self->device);
+ ASSERT_EQ(munmap(self->region.vaddr, self->mmap_size), 0);
+}
+
+TEST_F(vfio_dma_map_limit_test, unmap_range)
+{
+ struct vfio_dma_region *region = &self->region;
+ u64 unmapped;
+ int rc;
+
+ vfio_pci_dma_map(self->device, region);
+ ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr));
+
+ rc = __vfio_pci_dma_unmap(self->device, region, &unmapped);
+ ASSERT_EQ(rc, 0);
+ ASSERT_EQ(unmapped, region->size);
+}
+
+TEST_F(vfio_dma_map_limit_test, unmap_all)
+{
+ struct vfio_dma_region *region = &self->region;
+ u64 unmapped;
+ int rc;
+
+ vfio_pci_dma_map(self->device, region);
+ ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr));
+
+ rc = __vfio_pci_dma_unmap_all(self->device, &unmapped);
+ ASSERT_EQ(rc, 0);
+ ASSERT_EQ(unmapped, region->size);
+}
+
+TEST_F(vfio_dma_map_limit_test, overflow)
+{
+ struct vfio_dma_region *region = &self->region;
+ int rc;
+
+ region->iova = ~(iova_t)0 & ~(region->size - 1);
+ region->size = self->mmap_size;
+
+ rc = __vfio_pci_dma_map(self->device, region);
+ ASSERT_EQ(rc, -EOVERFLOW);
+
+ rc = __vfio_pci_dma_unmap(self->device, region, NULL);
+ ASSERT_EQ(rc, -EOVERFLOW);
+}
+
int main(int argc, char *argv[])
{
device_bdf = vfio_selftests_get_bdf(&argc, argv);
diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c
index 2dbd70b7db62..f69eec8b928d 100644
--- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c
+++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c
@@ -19,6 +19,7 @@ static const char *device_bdf;
} while (0)
static void region_setup(struct vfio_pci_device *device,
+ struct iova_allocator *iova_allocator,
struct vfio_dma_region *region, u64 size)
{
const int flags = MAP_SHARED | MAP_ANONYMOUS;
@@ -29,7 +30,7 @@ static void region_setup(struct vfio_pci_device *device,
VFIO_ASSERT_NE(vaddr, MAP_FAILED);
region->vaddr = vaddr;
- region->iova = (u64)vaddr;
+ region->iova = iova_allocator_alloc(iova_allocator, size);
region->size = size;
vfio_pci_dma_map(device, region);
@@ -44,6 +45,7 @@ static void region_teardown(struct vfio_pci_device *device,
FIXTURE(vfio_pci_driver_test) {
struct vfio_pci_device *device;
+ struct iova_allocator *iova_allocator;
struct vfio_dma_region memcpy_region;
void *vaddr;
int msi_fd;
@@ -72,14 +74,15 @@ FIXTURE_SETUP(vfio_pci_driver_test)
struct vfio_pci_driver *driver;
self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
+ self->iova_allocator = iova_allocator_init(self->device);
driver = &self->device->driver;
- region_setup(self->device, &self->memcpy_region, SZ_1G);
- region_setup(self->device, &driver->region, SZ_2M);
+ region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G);
+ region_setup(self->device, self->iova_allocator, &driver->region, SZ_2M);
/* Any IOVA that doesn't overlap memcpy_region and driver->region. */
- self->unmapped_iova = 8UL * SZ_1G;
+ self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G);
vfio_pci_driver_init(self->device);
self->msi_fd = self->device->msi_eventfds[driver->msi];
@@ -108,6 +111,7 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test)
region_teardown(self->device, &self->memcpy_region);
region_teardown(self->device, &driver->region);
+ iova_allocator_cleanup(self->iova_allocator);
vfio_pci_device_cleanup(self->device);
}
diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh
index edacebfc1632..8ceeb8a7894f 100755
--- a/tools/testing/selftests/vsock/vmtest.sh
+++ b/tools/testing/selftests/vsock/vmtest.sh
@@ -389,9 +389,9 @@ run_test() {
local rc
host_oops_cnt_before=$(dmesg | grep -c -i 'Oops')
- host_warn_cnt_before=$(dmesg --level=warn | wc -l)
+ host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock')
vm_oops_cnt_before=$(vm_ssh -- dmesg | grep -c -i 'Oops')
- vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | wc -l)
+ vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock')
name=$(echo "${1}" | awk '{ print $1 }')
eval test_"${name}"
@@ -403,7 +403,7 @@ run_test() {
rc=$KSFT_FAIL
fi
- host_warn_cnt_after=$(dmesg --level=warn | wc -l)
+ host_warn_cnt_after=$(dmesg --level=warn | grep -c -i 'vsock')
if [[ ${host_warn_cnt_after} -gt ${host_warn_cnt_before} ]]; then
echo "FAIL: kernel warning detected on host" | log_host "${name}"
rc=$KSFT_FAIL
@@ -415,7 +415,7 @@ run_test() {
rc=$KSFT_FAIL
fi
- vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | wc -l)
+ vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock')
if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then
echo "FAIL: kernel warning detected on vm" | log_host "${name}"
rc=$KSFT_FAIL
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
index 05e1e6774fba..918eaec8bfbe 100644
--- a/tools/testing/selftests/x86/test_vsyscall.c
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -308,12 +308,13 @@ static void test_getcpu(int cpu)
#ifdef __x86_64__
static jmp_buf jmpbuf;
-static volatile unsigned long segv_err;
+static volatile unsigned long segv_err, segv_trapno;
static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
{
ucontext_t *ctx = (ucontext_t *)ctx_void;
+ segv_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO];
segv_err = ctx->uc_mcontext.gregs[REG_ERR];
siglongjmp(jmpbuf, 1);
}
@@ -336,7 +337,8 @@ static void test_vsys_r(void)
else if (can_read)
ksft_test_result_pass("We have read access\n");
else
- ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err);
+ ksft_test_result_pass("We do not have read access (trap=%ld, error=0x%lx)\n",
+ segv_trapno, segv_err);
}
static void test_vsys_x(void)
@@ -347,7 +349,7 @@ static void test_vsys_x(void)
return;
}
- ksft_print_msg("Make sure that vsyscalls really page fault\n");
+ ksft_print_msg("Make sure that vsyscalls really cause a fault\n");
bool can_exec;
if (sigsetjmp(jmpbuf, 1) == 0) {
@@ -358,13 +360,14 @@ static void test_vsys_x(void)
}
if (can_exec)
- ksft_test_result_fail("Executing the vsyscall did not page fault\n");
- else if (segv_err & (1 << 4)) /* INSTR */
- ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n",
- segv_err);
+ ksft_test_result_fail("Executing the vsyscall did not fault\n");
+ /* #GP or #PF (with X86_PF_INSTR) */
+ else if ((segv_trapno == 13) || ((segv_trapno == 14) && (segv_err & (1 << 4))))
+ ksft_test_result_pass("Executing the vsyscall page failed (trap=%ld, error=0x%lx)\n",
+ segv_trapno, segv_err);
else
- ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n",
- segv_err);
+ ksft_test_result_fail("Execution failed with the wrong error (trap=%ld, error=0x%lx)\n",
+ segv_trapno, segv_err);
}
/*
diff --git a/tools/thermal/thermal-engine/thermal-engine.c b/tools/thermal/thermal-engine/thermal-engine.c
index 0764dc754771..66b0ba1fcd23 100644
--- a/tools/thermal/thermal-engine/thermal-engine.c
+++ b/tools/thermal/thermal-engine/thermal-engine.c
@@ -374,7 +374,7 @@ int main(int argc, char *argv[])
}
if (options.daemonize && daemon(0, 0)) {
- ERROR("Failed to daemonize: %p\n");
+ ERROR("Failed to daemonize: %m\n");
return THERMAL_ENGINE_DAEMON_ERROR;
}
diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c
index cf263fe9deaf..ef97916e3873 100644
--- a/tools/tracing/latency/latency-collector.c
+++ b/tools/tracing/latency/latency-collector.c
@@ -1725,7 +1725,7 @@ static void show_usage(void)
"-n, --notrace\t\tIf latency is detected, do not print out the content of\n"
"\t\t\tthe trace file to standard output\n\n"
-"-t, --threads NRTHR\tRun NRTHR threads for printing. Default is %d.\n\n"
+"-e, --threads NRTHR\tRun NRTHR threads for printing. Default is %d.\n\n"
"-r, --random\t\tArbitrarily sleep a certain amount of time, default\n"
"\t\t\t%ld ms, before reading the trace file. The\n"